package ws.palladian.extraction.location;

import com.aliasi.util.Strings;
import edu.stanford.nlp.classify.LinearClassifier;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jdesktop.swingx.JXLabel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.extraction.entity.ContextAnnotation;
import ws.palladian.extraction.entity.ContextTagger;
import ws.palladian.extraction.entity.StringTagger;
import ws.palladian.extraction.entity.WindowSizeContextTagger;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.processing.Tagger;
import ws.palladian.processing.features.Annotation;
import ws.palladian.processing.features.ImmutableAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/location/EntityPreprocessingTagger.class */
/**
 * Tagger that extracts entity candidates with a {@link WindowSizeContextTagger} and then filters or
 * trims those candidates using a case dictionary loaded from the classpath resource
 * {@code /caseDictionary.csv}.
 *
 * <p>
 * Each dictionary line is tab-separated; the first column is the dictionary key and the stored value is
 * {@code column[1] / column[2]}. Judging from the log messages this is a lowercase/uppercase occurrence
 * ratio (NOTE(review): column semantics are not visible here, confirm against the dictionary file).
 * Tokens whose ratio exceeds {@link #LOWERCASE_THRESHOLD} are treated as words that are usually written in
 * lowercase; tokens missing from the dictionary get a ratio of {@code 0}.
 *
 * <p>
 * Candidates whose left context marks them as being inside a sentence (see
 * {@link #isWithinSentence(ContextAnnotation)}) are always kept unchanged. Other candidates are dropped or
 * have their leading "usually lowercase" tokens removed.
 */
public class EntityPreprocessingTagger implements Tagger {
    private static final Logger LOGGER = LoggerFactory.getLogger(EntityPreprocessingTagger.class);

    /** Ratio above which a token is considered to be usually written in lowercase. */
    private static final double LOWERCASE_THRESHOLD = 1.75d;

    /** Context window size handed to the {@link WindowSizeContextTagger}. */
    private static final int CONTEXT_LENGTH = 5;

    /** Classpath location of the case dictionary. */
    private static final String CASE_DICTIONARY_RESOURCE = "/caseDictionary.csv";

    /**
     * Matches a left context that ends with at least one letter, digit or comma followed by exactly one
     * whitespace character. Compiled once because it is evaluated for every candidate.
     */
    private static final Pattern WITHIN_SENTENCE_PATTERN = Pattern.compile(".*[A-Za-z0-9,]+\\s");

    /** Input file used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_MAIN_INPUT = "/Users/pk/Desktop/LocationLab/TUD-Loc-2013_V1/text27.txt";

    /** Tag assigned to annotations created by {@link #getLongAnnotationSplit(List, int)}. */
    public static final String SPLIT_ANNOTATION_TAG = "PARTIAL_CANDIDATE";

    private final ContextTagger tagger;
    private final Map<String, Double> caseDictionary;
    private final int longAnnotationSplit;

    /**
     * Creates a tagger which does not split long annotations.
     */
    public EntityPreprocessingTagger() {
        this(0);
    }

    /**
     * Creates a tagger and loads the case dictionary from the classpath.
     *
     * @param i Minimum number of whitespace-separated tokens an annotation must have to be considered for
     *        token-group splitting; values {@code <= 0} disable the additional splitting step in
     *        {@link #getAnnotations(String)} entirely.
     */
    public EntityPreprocessingTagger(int i) {
        this.tagger = new WindowSizeContextTagger(StringTagger.PATTERN, StringTagger.CANDIDATE_TAG, CONTEXT_LENGTH);
        InputStream inputStream = null;
        try {
            // NOTE(review): getResourceAsStream returns null when the resource is missing; how
            // FileHelper.performActionOnEveryLine reacts to a null stream is not visible here.
            inputStream = EntityPreprocessingTagger.class.getResourceAsStream(CASE_DICTIONARY_RESOURCE);
            this.caseDictionary = loadCaseDictionary(inputStream);
        } finally {
            FileHelper.close(inputStream);
        }
        this.longAnnotationSplit = i;
    }

    /**
     * Reads the case dictionary: one tab-separated entry per line, mapping the first column to the
     * quotient of the second and third column.
     *
     * @param inputStream Stream providing the dictionary content.
     * @return Map from dictionary key to ratio.
     */
    private static final Map<String, Double> loadCaseDictionary(InputStream inputStream) {
        final Map<String, Double> dictionary = CollectionHelper.newHashMap();
        FileHelper.performActionOnEveryLine(inputStream, new LineAction() {
            @Override
            public void performAction(String str, int i) {
                String[] columns = str.split("\t");
                if (columns.length < 3) {
                    // e.g. a blank line; indexing the missing columns would abort the whole load
                    LOGGER.warn("Skip malformed case dictionary line '{}'", str);
                    return;
                }
                double ratio = Double.parseDouble(columns[1]) / Double.parseDouble(columns[2]);
                dictionary.put(columns[0], Double.valueOf(ratio));
            }
        });
        return dictionary;
    }

    /**
     * Extracts candidates from the text and filters/corrects them with the case dictionary.
     *
     * <ul>
     * <li>Candidates within a sentence, and candidates whose exact value also occurs within a sentence, are
     * kept unchanged.</li>
     * <li>Remaining single-token candidates are dropped if their ratio exceeds the threshold.</li>
     * <li>Remaining multi-token candidates have leading tokens above the threshold stripped; if nothing
     * remains the candidate is dropped.</li>
     * </ul>
     * If long-annotation splitting is enabled, the additional partial annotations are appended to the result.
     *
     * @param str The text to tag.
     * @return The filtered and corrected annotations.
     */
    @Override
    public List<Annotation> getAnnotations(String str) {
        List<ContextAnnotation> candidates = this.tagger.getAnnotations(str);
        List<Annotation> result = CollectionHelper.newArrayList();
        Set<String> inSentenceCandidates = getInSentenceCandidates(candidates);
        for (ContextAnnotation candidate : candidates) {
            String value = candidate.getValue();
            if (isWithinSentence(candidate)) {
                result.add(candidate);
                continue;
            }
            if (inSentenceCandidates.contains(value)) {
                LOGGER.trace("Skip '{}', because it appears within a sentence", value);
                result.add(candidate);
                continue;
            }
            String[] tokens = value.split("\\s");
            if (tokens.length == 1) {
                double ratio = getLowercaseRatio(value);
                if (ratio > LOWERCASE_THRESHOLD) {
                    LOGGER.debug("Drop '{}' because of lc/uc ratio of {}", value, Double.valueOf(ratio));
                } else {
                    result.add(candidate);
                }
            } else {
                correctMultiTokenCandidate(candidate, tokens, inSentenceCandidates, result);
            }
        }
        LOGGER.debug("Reduced from {} to {} with with case dictionary", Integer.valueOf(candidates.size()), Integer.valueOf(result.size()));
        if (this.longAnnotationSplit > 0) {
            List<Annotation> splitAnnotations = getLongAnnotationSplit(result, this.longAnnotationSplit);
            LOGGER.debug("Extracted additional {} annotations by splitting", Integer.valueOf(splitAnnotations.size()));
            result.addAll(splitAnnotations);
        }
        return result;
    }

    /**
     * Strips leading tokens whose ratio exceeds the threshold from a multi-token candidate and adds the
     * outcome (unchanged candidate, shortened annotation, or nothing) to {@code result}.
     *
     * @param candidate The candidate to correct.
     * @param tokens The candidate's value split at single whitespace characters.
     * @param inSentenceCandidates Values which were seen within a sentence; stripping stops as soon as the
     *        remainder equals one of them.
     * @param result Receiver for the resulting annotation.
     */
    private void correctMultiTokenCandidate(ContextAnnotation candidate, String[] tokens,
            Set<String> inSentenceCandidates, List<Annotation> result) {
        String value = candidate.getValue();
        LOGGER.trace("Start correcting '{}'", value);
        int offset = 0;
        String remainder = value;
        for (String token : tokens) {
            double ratio = getLowercaseRatio(token);
            if (ratio <= LOWERCASE_THRESHOLD) {
                LOGGER.trace("Stop correcting '{}' at '{}' because of lc/uc ratio of {}", value, remainder, Double.valueOf(ratio));
                break;
            }
            // skip the token plus the single whitespace character which followed it
            offset += token.length() + 1;
            if (offset >= value.length()) {
                break;
            }
            remainder = value.substring(offset);
            if (inSentenceCandidates.contains(remainder)) {
                LOGGER.trace("Stop correcting '{}' as '{}' is contained within sentence", value, remainder);
                break;
            }
        }
        if (offset >= value.length()) {
            LOGGER.debug("Drop '{}' completely because of lc/uc ratio", value);
        } else if (offset > 0) {
            LOGGER.debug("Correct '{}' to '{}' because of lc/uc ratios", value, remainder);
            result.add(new ImmutableAnnotation(candidate.getStartPosition() + offset, remainder, candidate.getTag()));
        } else {
            result.add(candidate);
        }
    }

    /**
     * Derives additional partial annotations, tagged {@link #SPLIT_ANNOTATION_TAG}, from the given
     * annotations:
     * <ul>
     * <li>For annotations with at least {@code i} whitespace-separated tokens: every maximal run of
     * consecutive tokens with a ratio below the threshold, joined by a single space, if the joined string
     * is longer than one character.</li>
     * <li>For every annotation containing a hyphen (after quote normalization): each hyphen-separated part
     * for which {@link StringHelper#startsUppercase(String)} holds.</li>
     * </ul>
     * Start positions are computed from the running character offset, so repeated tokens/parts and
     * non-space whitespace separators yield the correct position.
     *
     * @param list The annotations to inspect.
     * @param i Minimum token count for the token-run splitting.
     * @return The newly created partial annotations (the input list is not modified).
     */
    List<Annotation> getLongAnnotationSplit(List<Annotation> list, int i) {
        List<Annotation> splits = CollectionHelper.newArrayList();
        for (Annotation annotation : list) {
            String value = annotation.getValue();
            int start = annotation.getStartPosition();

            String[] tokens = value.split("\\s");
            if (tokens.length >= i) {
                List<String> group = CollectionHelper.newArrayList();
                int groupOffset = 0;
                int tokenOffset = 0;
                for (String token : tokens) {
                    if (getLowercaseRatio(token) < LOWERCASE_THRESHOLD) {
                        if (group.isEmpty()) {
                            groupOffset = tokenOffset;
                        }
                        group.add(token);
                    } else {
                        addTokenGroup(splits, start + groupOffset, group);
                        group.clear();
                    }
                    // every token returned by split("\\s") is followed by exactly one separator character
                    tokenOffset += token.length() + 1;
                }
                addTokenGroup(splits, start + groupOffset, group);
            }

            String normalized = StringHelper.normalizeQuotes(value);
            if (normalized.contains("-")) {
                int partOffset = 0;
                for (String part : normalized.split("-")) {
                    if (StringHelper.startsUppercase(part)) {
                        // Prefer the running offset when the original text really contains the part there;
                        // otherwise (normalization altered the text) keep the previous first-occurrence lookup.
                        int offset = value.regionMatches(partOffset, part, 0, part.length()) ? partOffset : value.indexOf(part);
                        splits.add(new ImmutableAnnotation(start + offset, part, SPLIT_ANNOTATION_TAG));
                    }
                    partOffset += part.length() + 1;
                }
            }
        }
        return splits;
    }

    /**
     * Joins a token group with single spaces and, if the result is longer than one character, adds it as
     * partial annotation. Empty groups are ignored.
     *
     * @param target Receiver for the new annotation.
     * @param startPosition Absolute start position of the group's first token.
     * @param group The tokens forming the group.
     */
    private static void addTokenGroup(List<Annotation> target, int startPosition, List<String> group) {
        if (group.isEmpty()) {
            return;
        }
        String joined = StringUtils.join(group, " ");
        if (joined.length() > 1) {
            target.add(new ImmutableAnnotation(startPosition, joined, SPLIT_ANNOTATION_TAG));
        }
    }

    /**
     * Collects the values of all annotations which occur within a sentence.
     *
     * @param list The annotations to inspect.
     * @return Set of values for which {@link #isWithinSentence(ContextAnnotation)} holds.
     */
    private static Set<String> getInSentenceCandidates(List<ContextAnnotation> list) {
        Set<String> inSentence = CollectionHelper.newHashSet();
        for (ContextAnnotation annotation : list) {
            if (isWithinSentence(annotation)) {
                String value = annotation.getValue();
                LOGGER.trace("Add '{}' to in-sentence candidates ({})", value, annotation.getLeftContext());
                inSentence.add(value);
            }
        }
        return inSentence;
    }

    /**
     * Checks whether the annotation's left context ends with letters, digits or commas followed by a single
     * whitespace character, i.e. whether the annotation is directly preceded by a word.
     *
     * @param contextAnnotation The annotation to check.
     * @return {@code true} if the left context matches.
     */
    private static boolean isWithinSentence(ContextAnnotation contextAnnotation) {
        return WITHIN_SENTENCE_PATTERN.matcher(contextAnnotation.getLeftContext()).matches();
    }

    /**
     * Looks up the ratio for a token in the case dictionary.
     *
     * @param str The token; lowercased locale-independently before the lookup.
     * @return The stored ratio, or {@code 0} if the token is not in the dictionary.
     */
    private double getLowercaseRatio(String str) {
        // Locale.ROOT: the default locale must not influence the lookup key (e.g. Turkish dotless i)
        Double ratio = this.caseDictionary.get(str.toLowerCase(Locale.ROOT));
        return ratio == null ? 0.0d : ratio.doubleValue();
    }

    /**
     * Lowercases every token except the first one whose ratio exceeds the threshold. Tokens are obtained by
     * splitting at single whitespace characters and are re-joined with single spaces. A trailing period on
     * the last token is ignored for the dictionary lookup but kept in the output.
     *
     * @param str The text to correct.
     * @return The text with corrected capitalization.
     */
    public String correctCapitalization(String str) {
        String[] tokens = str.split("\\s");
        StringBuilder corrected = new StringBuilder();
        for (int idx = 0; idx < tokens.length; idx++) {
            String token = tokens[idx];
            if (idx > 0) {
                corrected.append(" ");
            }
            String lookup = token;
            if (idx == tokens.length - 1 && token.endsWith(".")) {
                lookup = token.substring(0, token.length() - 1);
            }
            if (idx > 0 && getLowercaseRatio(lookup) > LOWERCASE_THRESHOLD) {
                token = token.toLowerCase(Locale.ROOT);
            }
            corrected.append(token);
        }
        return corrected.toString();
    }

    /**
     * Tags a text file and prints the annotations.
     *
     * @param strArr Optional: first element is the path of the file to process; without arguments the
     *        built-in default path is used.
     */
    public static void main(String[] strArr) {
        String path = strArr != null && strArr.length > 0 ? strArr[0] : DEFAULT_MAIN_INPUT;
        String text = HtmlHelper.stripHtmlTags(FileHelper.readFileToString(path));
        CollectionHelper.print(new EntityPreprocessingTagger().getAnnotations(text));
    }
}
