package ws.palladian.extraction.entity.tagger;

import com.aliasi.util.Strings;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.tuple.Pair;
import org.jdesktop.swingx.JXLabel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.CategoryEntries;
import ws.palladian.classification.CategoryEntriesMap;
import ws.palladian.classification.text.DictionaryModel;
import ws.palladian.classification.text.FeatureSetting;
import ws.palladian.classification.text.PalladianTextClassifier;
import ws.palladian.classification.utils.ClassificationUtils;
import ws.palladian.extraction.entity.Annotations;
import ws.palladian.extraction.entity.ContextAnnotation;
import ws.palladian.extraction.entity.DateAndTimeTagger;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.extraction.entity.StringTagger;
import ws.palladian.extraction.entity.TaggingFormat;
import ws.palladian.extraction.entity.TrainableNamedEntityRecognizer;
import ws.palladian.extraction.entity.UrlTagger;
import ws.palladian.extraction.entity.evaluation.EvaluationResult;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.ProgressHelper;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.collection.CountMatrix;
import ws.palladian.helper.constants.RegExp;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.processing.ClassifiedTextDocument;
import ws.palladian.processing.Trainable;
import ws.palladian.processing.features.Annotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/entity/tagger/PalladianNer.class */
public class PalladianNer extends TrainableNamedEntityRecognizer implements Serializable {
    /** Serialization version; whole instances are written by {@code saveModel} and read back by {@code loadModel}. */
    private static final long serialVersionUID = -8793232373094322955L;
    /** Classifier used to train and apply {@link #annotationModel}; transient, re-created by the constructor. */
    private transient PalladianTextClassifier entityClassifier;
    /** Second classifier created by the constructor; not used in the visible part of this class. */
    private transient PalladianTextClassifier contextClassifier;
    /** Dictionary of known entity values and their tags, filled through {@code addToEntityDictionary}. */
    private DictionaryModel entityDictionary;
    /** Concept names read from the first line of an entity dictionary file; consulted when re-tagging via the dictionary. */
    private List<String> conceptLikelihoodOrder;
    /** Model produced by {@link #entityClassifier} during training and used to classify candidates. */
    private DictionaryModel annotationModel;
    /** Copied by {@code loadModel}; presumably filled by the context analysis outside this view -- TODO confirm. */
    private DictionaryModel contextModel;
    /** Maps lowercased tokens to their case signature ("Aa", "A" or "a"). */
    private DictionaryModel caseDictionary;
    /** Counts of left-context strings; read in post-processing to split annotations at known contexts. */
    private CountMap<String> leftContextMap;
    /** Copied by {@code loadModel}; presumably filled by the context analysis outside this view -- TODO confirm. */
    private CountMatrix<String> patternProbabilityMatrix;
    /** Annotation values collected in English training that are removed again when tagging. */
    private Set<String> removeAnnotations;
    /** Post-processing switch: drop annotations whose value contains a date fragment. */
    private boolean removeDates;
    /** Post-processing switch: strip date fragments from annotation values. */
    private boolean removeDateEntries;
    /** Post-processing switch: drop annotations listed (case-insensitively) in {@link #removeAnnotations}. */
    private boolean removeIncorrectlyTaggedInTraining;
    /** Post-processing switch: drop single-token annotations the case dictionary sees mostly in lowercase. */
    private boolean removeSentenceStartErrorsCaseDictionary;
    /** Post-processing switch: let the context analysis change the tag of an annotation. */
    private boolean switchTagAnnotationsUsingPatterns;
    /** Post-processing switch: replace tags with the entries of {@link #entityDictionary} when the value is known. */
    private boolean switchTagAnnotationsUsingDictionary;
    /** Switch: try to split candidates and annotations with {@code unwrapAnnotations}. */
    private boolean unwrapEntities;
    /** Post-processing switch: split annotations that start with or contain a known left context. */
    private boolean unwrapEntitiesWithContext;
    /** Constant flag; not read anywhere in the visible part of this class. */
    private final boolean retraining = true;
    /** Whether URLs are tagged in addition to entities; presumably exposed through {@code isTagUrls()} -- confirm. */
    private boolean tagUrls;
    /** Whether dates are tagged in addition to entities; presumably exposed through {@code isTagDates()} -- confirm. */
    private boolean tagDates;
    /** Tag given to negative training examples; classifications with this tag count as "no entity". */
    private static final String NO_ENTITY = "###NO_ENTITY###";
    /** Selects the English-specific or the language-independent training and tagging path. */
    private final LanguageMode languageMode;
    /** Training mode chosen at construction; not read in the visible part of this class. */
    private final TrainingMode trainingMode;
    private static final Logger LOGGER = LoggerFactory.getLogger(PalladianNer.class);
    /** Language mode used by the constructors that do not take one. */
    private static final LanguageMode DEFAULT_LANGUAGE_MODE = LanguageMode.English;
    /** Training mode used by the constructors that do not take one. */
    private static final TrainingMode DEFAULT_TRAINING_MODE = TrainingMode.Complete;

    /* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/entity/tagger/PalladianNer$LanguageMode.class */
    /**
     * Selects which training and tagging code path the recognizer uses.
     */
    public enum LanguageMode {
        /** Routes training and tagging through the language-independent methods. */
        LanguageIndependent,
        /** Routes training and tagging through the English-specific methods; this is the default. */
        English
    }

    /* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/entity/tagger/PalladianNer$TrainingMode.class */
    /**
     * Training mode handed to the constructor and stored on the recognizer.
     * NOTE(review): the visible code never branches on it; the names suggest completely versus sparsely
     * annotated training data -- confirm against the rest of the class.
     */
    public enum TrainingMode {
        /** Default mode. */
        Complete,
        Sparse
    }

    /**
     * Creates an untrained recognizer with empty dictionaries and all default switches.
     *
     * @param languageMode selects the English or the language-independent code path; must not be null
     * @param trainingMode training mode to remember; must not be null
     * @throws NullPointerException if either argument is null (raised by {@code Validate.notNull})
     */
    public PalladianNer(LanguageMode languageMode, TrainingMode trainingMode) {
        // Validate first so that nothing is allocated for an invalid call.
        Validate.notNull(languageMode, "languageMode must not be null");
        Validate.notNull(trainingMode, "trainingMode must not be null");
        this.languageMode = languageMode;
        this.trainingMode = trainingMode;

        // The final field "retraining" already carries its initializer and must not be assigned here.
        this.conceptLikelihoodOrder = CollectionHelper.newArrayList();
        this.leftContextMap = CountMap.create();
        this.patternProbabilityMatrix = CountMatrix.create();
        this.removeAnnotations = CollectionHelper.newHashSet();
        this.entityDictionary = new DictionaryModel(null);
        this.caseDictionary = new DictionaryModel(null);

        // Character n-gram classifiers: 2-8 grams for entity values, 4-5 grams for contexts.
        this.entityClassifier = new PalladianTextClassifier(new FeatureSetting(FeatureSetting.TextFeatureType.CHAR_NGRAMS, 2, 8));
        this.contextClassifier = new PalladianTextClassifier(new FeatureSetting(FeatureSetting.TextFeatureType.CHAR_NGRAMS, 4, 5));

        // Default post-processing configuration.
        this.removeDates = true;
        this.removeDateEntries = true;
        this.removeIncorrectlyTaggedInTraining = true;
        this.removeSentenceStartErrorsCaseDictionary = false;
        this.switchTagAnnotationsUsingPatterns = true;
        this.switchTagAnnotationsUsingDictionary = true;
        this.unwrapEntities = true;
        this.unwrapEntitiesWithContext = true;
        this.tagUrls = true;
        this.tagDates = true;
    }

    /**
     * Creates a recognizer for the given language mode using the default training mode.
     *
     * @param languageMode language mode to use; null is rejected by the delegated constructor
     */
    public PalladianNer(LanguageMode languageMode) {
        this(languageMode, DEFAULT_TRAINING_MODE);
    }

    /**
     * Creates a recognizer for the given training mode using the default language mode.
     *
     * @param trainingMode training mode to use; null is rejected by the delegated constructor
     */
    public PalladianNer(TrainingMode trainingMode) {
        this(DEFAULT_LANGUAGE_MODE, trainingMode);
    }

    /**
     * Creates a recognizer with the default language mode and the default training mode.
     */
    public PalladianNer() {
        this(DEFAULT_LANGUAGE_MODE, DEFAULT_TRAINING_MODE);
    }

    /**
     * Returns the file ending of serialized models, without a leading dot.
     *
     * @return the constant {@code "model.gz"}
     */
    public static String getModelFileEndingStatic() {
        return "model.gz";
    }

    /**
     * Instance-level accessor for the model file ending; same value as {@link #getModelFileEndingStatic()}.
     *
     * @return the model file ending without a leading dot
     */
    @Override // ws.palladian.extraction.entity.TrainableNamedEntityRecognizer
    public String getModelFileEnding() {
        return getModelFileEndingStatic();
    }

    /**
     * Always returns {@code false}.
     * NOTE(review): how the superclass interprets this flag is not visible here; {@code loadModel} and
     * {@code saveModel} in this class append the ending themselves when it is missing.
     *
     * @return {@code false}
     */
    @Override // ws.palladian.extraction.entity.TrainableNamedEntityRecognizer
    public boolean setsModelFileEndingAutomatically() {
        return false;
    }

    /**
     * Loads a serialized recognizer and copies its dictionaries, models and switches into this instance.
     * The transient classifiers of this instance are kept as they are.
     *
     * @param str path of the model file; the model file ending is appended when it is missing
     * @return {@code true} when the model was loaded, {@code false} when deserialization yielded nothing
     */
    @Override // ws.palladian.extraction.entity.TrainableNamedEntityRecognizer
    public boolean loadModel(String str) {
        StopWatch stopWatch = new StopWatch();
        String modelPath = str;
        if (!modelPath.endsWith(getModelFileEnding())) {
            modelPath = modelPath + "." + getModelFileEnding();
        }

        // Release the current data before reading the new model (presumably so that both need not be
        // held in memory at once). After a failed load the recognizer therefore has no usable model.
        this.entityDictionary = null;
        this.caseDictionary = null;
        this.leftContextMap = null;
        this.patternProbabilityMatrix = null;
        this.removeAnnotations = null;

        PalladianNer loaded = (PalladianNer) FileHelper.deserialize(modelPath);
        if (loaded == null) {
            // Report the failure through the return value instead of a NullPointerException below.
            LOGGER.error("model {} could not be loaded", modelPath);
            return false;
        }

        this.entityDictionary = loaded.entityDictionary;
        this.conceptLikelihoodOrder = loaded.conceptLikelihoodOrder;
        this.annotationModel = loaded.annotationModel;
        this.caseDictionary = loaded.caseDictionary;
        this.leftContextMap = loaded.leftContextMap;
        this.contextModel = loaded.contextModel;
        this.patternProbabilityMatrix = loaded.patternProbabilityMatrix;
        this.removeAnnotations = loaded.removeAnnotations;
        this.removeDates = loaded.removeDates;
        this.removeDateEntries = loaded.removeDateEntries;
        this.removeIncorrectlyTaggedInTraining = loaded.removeIncorrectlyTaggedInTraining;
        this.removeSentenceStartErrorsCaseDictionary = loaded.removeSentenceStartErrorsCaseDictionary;
        this.switchTagAnnotationsUsingPatterns = loaded.switchTagAnnotationsUsingPatterns;
        this.switchTagAnnotationsUsingDictionary = loaded.switchTagAnnotationsUsingDictionary;
        this.unwrapEntities = loaded.unwrapEntities;
        this.unwrapEntitiesWithContext = loaded.unwrapEntitiesWithContext;
        this.tagDates = loaded.tagDates;
        this.tagUrls = loaded.tagUrls;
        LOGGER.info("model " + modelPath + " successfully loaded in " + stopWatch.getElapsedTimeString());
        return true;
    }

    /**
     * Serializes this recognizer and writes a companion "_meta.txt" file that lists the categories of the
     * annotation model, one per line.
     *
     * @param str target path; the model file ending is appended when it is missing
     */
    protected void saveModel(String str) {
        LOGGER.info("entity dictionary contains " + this.entityDictionary.getNumTerms() + " entities");
        LOGGER.info("case dictionary contains " + this.caseDictionary.getNumTerms() + " entities");
        // Logged with the path exactly as passed in, i.e. before the ending is appended.
        LOGGER.info("serializing Palladian NER to " + str);

        final String modelPath = str.endsWith(getModelFileEnding()) ? str : str + "." + getModelFileEnding();
        FileHelper.serialize(this, modelPath);

        LOGGER.info("dictionary size: " + this.annotationModel.getNumTerms());
        LOGGER.info("write model meta information");
        final StringBuilder meta = new StringBuilder();
        for (String category : this.annotationModel.getCategories()) {
            meta.append(category);
            meta.append(FileHelper.NEWLINE_CHARACTER);
        }
        final String metaPath = FileHelper.getFilePath(modelPath) + FileHelper.getFileName(modelPath) + "_meta.txt";
        FileHelper.writeToFile(metaPath, meta);
        LOGGER.info("all Palladian NER files written");
    }

    /**
     * Registers the value of an annotation under the annotation's tag in the entity dictionary.
     *
     * @param annotation annotation whose value and tag are recorded
     */
    private void addToEntityDictionary(Annotation annotation) {
        final String entityValue = annotation.getValue();
        final String entityTag = annotation.getTag();
        addToEntityDictionary(entityValue, entityTag);
    }

    /**
     * Records an entity value with its tag in the entity dictionary.
     *
     * @param str  the entity value (dictionary term)
     * @param str2 the tag (category) to associate with the value
     */
    private void addToEntityDictionary(String str, String str2) {
        this.entityDictionary.updateTerm(str, str2);
    }

    /**
     * Records how a token is cased. Tokens shorter than two characters after trimming are ignored, as are
     * tokens whose case signature is anything other than "Aa", "A" or "a".
     *
     * @param str the raw token
     */
    private void addToCaseDictionary(String str) {
        final String token = StringHelper.trim(str);
        if (token.length() >= 2) {
            final String signature = StringHelper.getCaseSignature(token);
            switch (signature) {
                case "Aa":
                case "A":
                case "a":
                    // The dictionary is keyed by the lowercased token.
                    this.caseDictionary.updateTerm(token.toLowerCase(), signature);
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Trains on a training file without any additional annotations.
     *
     * @param str  path of the training file
     * @param str2 path the model is saved to
     * @return the result of the delegated training call
     */
    @Override // ws.palladian.extraction.entity.TrainableNamedEntityRecognizer
    public boolean train(String str, String str2) {
        // An empty annotation collection stands for "no additional annotations".
        final Annotations noAdditionalAnnotations = new Annotations();
        return train(str, noAdditionalAnnotations, str2);
    }

    /**
     * Trains on a training file plus additional annotations. The additional annotations are always added
     * to the entity dictionary; only the English path also receives them as training input.
     *
     * @param str  path of the training file
     * @param list additional annotations
     * @param str2 path the model is saved to
     * @return the result of the mode-specific training method
     */
    public boolean train(String str, List<? extends Annotation> list, String str2) {
        LOGGER.info("Start creating {} annotations for training", Integer.valueOf(list.size()));
        for (Annotation additional : list) {
            addToEntityDictionary(additional);
        }
        if (this.languageMode.equals(LanguageMode.English)) {
            return trainEnglish(str, str2, list);
        }
        return trainLanguageIndependent(str, str2);
    }

    /**
     * Replaces the entity dictionary with the content of a file. The first line lists the concept names
     * separated by {@code >} and becomes the concept likelihood order; every further line has the form
     * {@code tag###value}. Lines without the {@code ###} separator are skipped.
     *
     * @param str path of the dictionary file
     */
    public void setEntityDictionary(String str) {
        this.entityDictionary = new DictionaryModel(null);
        StopWatch stopWatch = new StopWatch();
        // Keep the lines in a local so that their number is available for progress reporting.
        List<String> lines = FileHelper.readFileToArray(str);
        // 1 while the header is still to be read, afterwards 2 + number of entities added so far.
        int counter = 1;
        for (String line : lines) {
            if (counter == 1) {
                this.conceptLikelihoodOrder = CollectionHelper.newArrayList();
                this.conceptLikelihoodOrder.addAll(Arrays.asList(line.split("\\>")));
                counter++;
                continue;
            }
            String[] parts = line.split("###");
            if (parts.length < 2) {
                continue;
            }
            addToEntityDictionary(parts[1], parts[0]);
            ProgressHelper.printProgress(counter, lines.size(), 1.0d, stopWatch);
            counter++;
        }
        // Never report a negative count for an empty file.
        int added = Math.max(0, counter - 2);
        LOGGER.info("Added {} entities to the dictionary in {}", Integer.valueOf(added), stopWatch.getElapsedTimeString());
    }

    /**
     * Trains language-independently, using the same annotations both as classifier training data and as
     * entity dictionary entries.
     *
     * @param annotations the training annotations
     * @param str         path the model is saved to
     * @return the result of the delegated training call
     */
    public boolean train(Annotations<ContextAnnotation> annotations, String str) {
        return trainLanguageIndependent(annotations, annotations, str);
    }

    /**
     * Trains the annotation classifier from one set of annotations, fills the entity dictionary from a
     * second set and saves the model.
     *
     * @param annotations  annotations turned into classifier training documents (tag as category, value as text)
     * @param annotations2 annotations added to the entity dictionary
     * @param str          path the model is saved to
     * @return always {@code true}
     */
    public boolean trainLanguageIndependent(Annotations<ContextAnnotation> annotations, Annotations<ContextAnnotation> annotations2, String str) {
        List<ClassifiedTextDocument> trainingDocuments = CollectionHelper.newArrayList();
        LOGGER.info("Start creating {} annotations for training", Integer.valueOf(annotations.size()));
        for (ContextAnnotation annotation : annotations) {
            trainingDocuments.add(new ClassifiedTextDocument(annotation.getTag(), annotation.getValue()));
        }
        for (ContextAnnotation annotation : annotations2) {
            addToEntityDictionary(annotation);
        }
        trainAnnotationClassifier(trainingDocuments);
        saveModel(str);
        return true;
    }

    /**
     * Trains the entity classifier on the given documents and stores the resulting model as the
     * annotation model, replacing any earlier one.
     *
     * @param list the training documents
     */
    private void trainAnnotationClassifier(List<ClassifiedTextDocument> list) {
        LOGGER.info("start training classifiers now...");
        // NOTE(review): the cast presumably selects a specific train(...) overload -- keep it.
        this.annotationModel = this.entityClassifier.train((Iterable<? extends Trainable>) list);
    }

    /**
     * Language-independent training from a column-formatted file. The file is read twice, once token based
     * and once with the regular column parser; the additional annotations are appended to both results.
     *
     * @param str  path of the training file
     * @param str2 path the model is saved to
     * @param list additional annotations
     * @return the result of the delegated training call
     */
    private boolean trainLanguageIndependent(String str, String str2, List<ContextAnnotation> list) {
        // Token-based annotations feed the context analysis and the classifier.
        final Annotations<ContextAnnotation> tokenAnnotations = FileFormatParser.getAnnotationsFromColumnTokenBased(str);
        // The regular column annotations end up in the entity dictionary.
        final Annotations<ContextAnnotation> dictionaryAnnotations = FileFormatParser.getAnnotationsFromColumn(str);

        tokenAnnotations.addAll(list);
        dictionaryAnnotations.addAll(list);

        analyzeContexts(str, tokenAnnotations);
        return trainLanguageIndependent(tokenAnnotations, dictionaryAnnotations, str2);
    }

    /**
     * Language-independent training from a file only, without additional annotations.
     *
     * @param str  path of the training file
     * @param str2 path the model is saved to
     * @return the result of the delegated training call
     */
    private boolean trainLanguageIndependent(String str, String str2) {
        final List<ContextAnnotation> noAdditionalAnnotations = Collections.<ContextAnnotation>emptyList();
        return trainLanguageIndependent(str, str2, noAdditionalAnnotations);
    }

    /**
     * English training in two passes.
     * <ol>
     * <li>Annotations from the column-formatted file plus the additional ones become training documents
     * and entity dictionary entries; every token of the text feeds the case dictionary. The classifier is
     * trained and the model saved.</li>
     * <li>The fresh model is evaluated on the training file. Every ERROR1 result is added as a
     * {@code NO_ENTITY} training document, and its value is remembered for removal at tagging time unless
     * the same value (ignoring case) occurs among the gold annotations. The classifier is trained again,
     * contexts are analyzed and the model is saved a second time.</li>
     * </ol>
     *
     * @param str  path of the column-formatted training file
     * @param str2 path the model is saved to
     * @param list additional annotations
     * @return always {@code true}
     */
    private boolean trainEnglish(String str, String str2, List<? extends Annotation> list) {
        LOGGER.info("get annotations from column-formatted training file");
        Annotations<ContextAnnotation> trainingAnnotations = FileFormatParser.getAnnotationsFromColumn(str);
        for (Annotation additional : list) {
            trainingAnnotations.add(new ContextAnnotation(additional));
        }

        List<ClassifiedTextDocument> trainingDocuments = CollectionHelper.newArrayList();
        LOGGER.info("add additional training annotations");
        int processed = 0;
        for (ContextAnnotation annotation : trainingAnnotations) {
            trainingDocuments.add(new ClassifiedTextDocument(annotation.getTag(), annotation.getValue()));
            addToEntityDictionary(annotation);
            processed++;
            ProgressHelper.printProgress(processed, trainingAnnotations.size(), 1.0d);
        }
        // Report the number actually processed (the counter used to be one too high).
        LOGGER.info("add {} additional training annotations", Integer.valueOf(processed));

        for (String token : Tokenizer.tokenize(FileFormatParser.getText(str, TaggingFormat.COLUMN))) {
            addToCaseDictionary(token);
        }

        LOGGER.info("start retraining (because of complete dataset, no sparse annotations)");
        trainAnnotationClassifier(trainingDocuments);
        saveModel(str2);

        this.removeAnnotations.clear();
        EvaluationResult evaluation = evaluate(str, TaggingFormat.COLUMN);

        // Lowercased gold values, collected once so that each error needs a single hash lookup.
        Set<String> goldValues = new HashSet<String>();
        for (ContextAnnotation gold : FileFormatParser.getAnnotations(str, TaggingFormat.COLUMN)) {
            goldValues.add(gold.getValue().toLowerCase());
        }

        for (Annotation wrong : evaluation.getAnnotations(EvaluationResult.ResultType.ERROR1)) {
            trainingDocuments.add(new ClassifiedTextDocument(NO_ENTITY, wrong.getValue()));
            // Only values that never occur as a gold entity are removed outright at tagging time.
            if (!goldValues.contains(wrong.getValue().toLowerCase())) {
                this.removeAnnotations.add(wrong.getValue());
            }
        }
        LOGGER.info(this.removeAnnotations.size() + " annotations need to be completely removed");

        trainAnnotationClassifier(trainingDocuments);
        analyzeContexts(str, trainingAnnotations);
        saveModel(str2);
        return true;
    }

    /**
     * Tells whether a classification names a real entity type.
     *
     * @param categoryEntries the classification result
     * @return {@code false} when there is no most likely category or when it equals the no-entity marker
     *         (ignoring case), {@code true} otherwise
     */
    private boolean hasAssignedType(CategoryEntries categoryEntries) {
        final String winner = categoryEntries.getMostLikelyCategory();
        if (winner == null) {
            return false;
        }
        return !winner.equalsIgnoreCase(NO_ENTITY);
    }

    /**
     * Classifies entity candidates for English text. When unwrapping is enabled a candidate is first
     * handed to {@code unwrapAnnotations}; if that yields parts, the parts with an assigned type are kept
     * in place of the candidate. Otherwise the candidate itself is classified and kept when the classifier
     * assigns a type.
     *
     * @param list the entity candidates
     * @return the annotations that received an entity type
     */
    private Annotations<ContextAnnotation> classifyCandidatesEnglish(List<ContextAnnotation> list) {
        Annotations<ContextAnnotation> classified = new Annotations<>();
        int processed = 0;
        for (ContextAnnotation candidate : list) {
            Annotations<ContextAnnotation> unwrapped = this.unwrapEntities
                    ? unwrapAnnotations(candidate, classified)
                    : new Annotations<ContextAnnotation>();
            if (unwrapped.isEmpty()) {
                CategoryEntries result = this.entityClassifier.classify(candidate.getValue(), this.annotationModel);
                if (hasAssignedType(result)) {
                    candidate.setTags(result);
                    classified.add(candidate);
                }
            } else {
                for (ContextAnnotation part : unwrapped) {
                    if (hasAssignedType(part.getTags())) {
                        classified.add(part);
                    }
                }
            }
            processed++;
            ProgressHelper.printProgress(processed, list.size(), 1.0d);
        }
        return classified;
    }

    /**
     * Classifies entity candidates without any language-specific handling: each candidate is classified
     * by its value and kept when the classifier assigns a type.
     *
     * @param list the entity candidates
     * @return the annotations that received an entity type
     */
    private Annotations<ContextAnnotation> classifyCandidatesLanguageIndependent(List<ContextAnnotation> list) {
        Annotations<ContextAnnotation> classified = new Annotations<>();
        int processed = 0;
        for (ContextAnnotation candidate : list) {
            CategoryEntries result = this.entityClassifier.classify(candidate.getValue(), this.annotationModel);
            if (hasAssignedType(result)) {
                candidate.setTags(result);
                classified.add(candidate);
            }
            processed++;
            ProgressHelper.printProgress(processed, list.size(), 1.0d);
        }
        return classified;
    }

    /**
     * Tags a text. Entities come from the English or the language-independent path depending on the
     * language mode; URL and date annotations are added when the respective tagging is enabled. Nested
     * annotations are removed after each addition and once more at the end, then the result is sorted.
     *
     * @param str the text to tag
     * @return the sorted annotations
     */
    @Override // ws.palladian.extraction.entity.NamedEntityRecognizer, ws.palladian.processing.Tagger
    public List<Annotation> getAnnotations(String str) {
        final StopWatch timer = new StopWatch();
        final Annotations merged = new Annotations();

        final boolean english = this.languageMode.equals(LanguageMode.English);
        merged.addAll(english ? getAnnotationsEnglish(str) : getAnnotationsLanguageIndependent(str));

        if (isTagUrls()) {
            final UrlTagger urlTagger = new UrlTagger();
            merged.addAll(urlTagger.getAnnotations(str));
            merged.removeNested();
        }
        if (isTagDates()) {
            final DateAndTimeTagger dateTagger = new DateAndTimeTagger();
            merged.addAll(dateTagger.getAnnotations(str));
            merged.removeNested();
        }

        merged.removeNested();
        merged.sort();
        LOGGER.info("Got {} annotations in {}", Integer.valueOf(merged.size()), timer.getElapsedTimeString());
        return merged;
    }

    /**
     * Cleans up and refines classified annotations in place. Each step runs only when its switch is set:
     * <ol>
     * <li>mark annotations whose value contains a date fragment for removal;</li>
     * <li>strip date fragments from values and shift the start position accordingly;</li>
     * <li>mark annotations whose value was flagged during training for removal;</li>
     * <li>mark single-token annotations for removal when the case dictionary has seen the token at
     * least as often in lowercase as capitalised;</li>
     * <li>remove everything marked so far;</li>
     * <li>let the context analysis change tags;</li>
     * <li>replace tags with entity dictionary entries, preferring the first concept of the likelihood
     * order that the dictionary knows for the value;</li>
     * <li>drop "docstart" annotations, unwrap annotations again, and split annotations at known
     * uppercase left contexts.</li>
     * </ol>
     *
     * @param list the annotations to process; modified in place
     */
    private void postProcessAnnotations(List<ContextAnnotation> list) {
        LOGGER.debug("start post processing annotations");
        StopWatch stopWatch = new StopWatch();
        Annotations<ContextAnnotation> toRemove = new Annotations<>();

        // Step 1: annotations containing a date fragment.
        if (this.removeDates) {
            stopWatch.start();
            int dateCount = 0;
            for (ContextAnnotation annotation : list) {
                if (containsDateFragment(annotation.getValue())) {
                    toRemove.add(annotation);
                    dateCount++;
                }
            }
            LOGGER.debug("removed " + dateCount + " purely date annotations in " + stopWatch.getElapsedTimeString());
        }

        // Step 2: cut date fragments out of values; the right element is the offset to add to the start.
        if (this.removeDateEntries) {
            stopWatch.start();
            int partialDateCount = 0;
            for (ContextAnnotation annotation : list) {
                Pair<String, Integer> stripped = removeDateFragment(annotation.getValue());
                int offset = stripped.getRight().intValue();
                annotation.setValue(stripped.getLeft());
                annotation.setStartPosition(annotation.getStartPosition() + offset);
                if (offset > 0) {
                    partialDateCount++;
                }
            }
            LOGGER.debug("removed " + partialDateCount + " partial date annotations in " + stopWatch.getElapsedTimeString());
        }

        // Step 3: values flagged during training, compared case-insensitively.
        if (this.removeIncorrectlyTaggedInTraining) {
            stopWatch.start();
            if (!this.removeAnnotations.isEmpty()) {
                Set<String> flaggedValues = new HashSet<String>();
                for (String flagged : this.removeAnnotations) {
                    flaggedValues.add(flagged.toLowerCase());
                }
                for (ContextAnnotation annotation : list) {
                    if (flaggedValues.contains(annotation.getValue().toLowerCase())) {
                        toRemove.add(annotation);
                    }
                }
            }
            LOGGER.debug("removed " + this.removeAnnotations.size() + " incorrectly tagged entities in training data in " + stopWatch.getElapsedTimeString());
        }

        // Step 4: single tokens that are usually written in lowercase.
        int caseRemovals = 0;
        if (this.removeSentenceStartErrorsCaseDictionary) {
            stopWatch.start();
            for (ContextAnnotation annotation : list) {
                if (annotation.getValue().indexOf(Strings.SINGLE_SPACE_STRING) != -1) {
                    continue;
                }
                // 2.0 means "keep"; it stays in place when the dictionary knows nothing about the token.
                double upperToLowerRatio = 2.0d;
                CategoryEntries caseEntries = this.caseDictionary.getCategoryEntries(annotation.getValue().toLowerCase());
                if (caseEntries != null && caseEntries.iterator().hasNext()) {
                    double allUpper = caseEntries.getProbability("A") > 0.0d ? caseEntries.getProbability("A") : 0.0d;
                    double capitalised = caseEntries.getProbability("Aa") > 0.0d ? caseEntries.getProbability("Aa") : 0.0d;
                    double allLower = caseEntries.getProbability("a") > 0.0d ? caseEntries.getProbability("a") : 0.0d;
                    upperToLowerRatio = allLower > 0.0d ? capitalised / allLower : 2.0d;
                    // Tokens that are mostly written all uppercase are kept.
                    if (allUpper > capitalised && allUpper > allLower) {
                        upperToLowerRatio = 2.0d;
                    }
                }
                if (upperToLowerRatio <= 1.0d) {
                    caseRemovals++;
                    toRemove.add(annotation);
                    LOGGER.debug("remove word using the case signature: " + annotation.getValue() + " (ratio:" + upperToLowerRatio + ") | " + annotation.getRightContext());
                }
            }
            LOGGER.debug("removed " + caseRemovals + " words at beginning of sentence in " + stopWatch.getElapsedTimeString());
        }

        // Step 5: apply the removals collected so far.
        LOGGER.debug("remove " + toRemove.size() + " entities");
        list.removeAll(toRemove);

        // Step 6: context patterns may change the tag.
        int changedByPatterns = 0;
        if (this.switchTagAnnotationsUsingPatterns) {
            stopWatch.start();
            for (ContextAnnotation annotation : list) {
                String tagBefore = annotation.getTag();
                applyContextAnalysis(annotation);
                if (!annotation.getTag().equalsIgnoreCase(tagBefore)) {
                    LOGGER.debug("changed " + annotation.getValue() + " from " + tagBefore + " to " + annotation.getTag() + ", left context: " + annotation.getLeftContext() + "____" + annotation.getRightContext());
                    changedByPatterns++;
                }
            }
            LOGGER.debug("changed " + MathHelper.round((100 * changedByPatterns) / (list.size() + 1.0E-12d), 2) + "% of the entities using patterns in " + stopWatch.getElapsedTimeString());
        }

        // Step 7: the entity dictionary overrides the classifier for known values.
        int changedByDictionary = 0;
        if (this.switchTagAnnotationsUsingDictionary) {
            stopWatch.start();
            for (ContextAnnotation annotation : list) {
                CategoryEntries dictionaryEntries = this.entityDictionary.getCategoryEntries(annotation.getValue());
                if (dictionaryEntries == null || !dictionaryEntries.iterator().hasNext()) {
                    continue;
                }
                if (this.conceptLikelihoodOrder != null) {
                    CategoryEntriesMap preferred = new CategoryEntriesMap();
                    // Stop at the first concept of the likelihood order that the dictionary knows;
                    // collecting every match would make the order meaningless.
                    conceptSearch:
                    for (String concept : this.conceptLikelihoodOrder) {
                        for (String category : dictionaryEntries) {
                            if (dictionaryEntries.getProbability(category) > 0.0d && category.equalsIgnoreCase(concept)) {
                                preferred.set(category, dictionaryEntries.getProbability(category));
                                break conceptSearch;
                            }
                        }
                    }
                    if (preferred.iterator().hasNext()) {
                        dictionaryEntries = preferred;
                    }
                }
                annotation.setTags(dictionaryEntries);
                changedByDictionary++;
            }
            LOGGER.debug("changed with entity dictionary " + MathHelper.round((100 * changedByDictionary) / (list.size() + 1.0E-12d), 2) + "% of the entities (total entities: " + list.size() + ") in " + stopWatch.getElapsedTimeString());
        }

        // Step 8: unwrapping. Additions and removals are collected and applied after the loop.
        Annotations<ContextAnnotation> toAdd = new Annotations<>();
        stopWatch.start();
        LinkedHashMap<String, Integer> leftContexts = this.leftContextMap.getSortedMapDescending();
        for (ContextAnnotation annotation : list) {
            if (annotation.getValue().toLowerCase().contains("docstart")) {
                toRemove.add(annotation);
                continue;
            }

            if (this.unwrapEntities) {
                Annotations<ContextAnnotation> unwrapped = unwrapAnnotations(annotation, list);
                if (!unwrapped.isEmpty()) {
                    StringBuilder message = new StringBuilder("tried to unwrap again ").append(annotation.getValue());
                    for (ContextAnnotation part : unwrapped) {
                        if (hasAssignedType(part.getTags())) {
                            toAdd.add(part);
                        }
                        message.append(" | ").append(part.getValue());
                    }
                    LOGGER.debug(message.toString());
                }
            }

            if (!this.unwrapEntitiesWithContext) {
                continue;
            }
            for (Map.Entry<String, Integer> entry : leftContexts.entrySet()) {
                String leftContext = entry.getKey();
                if (entry.getValue().intValue() == 0 || !StringHelper.startsUppercase(leftContext)) {
                    continue;
                }
                String value = annotation.getValue();
                int atStart = value.indexOf(leftContext + Strings.SINGLE_SPACE_STRING);
                int inside = value.indexOf(Strings.SINGLE_SPACE_STRING + leftContext + Strings.SINGLE_SPACE_STRING);

                // Index right behind the matched context, including its surrounding blanks.
                int cut;
                if (atStart == 0) {
                    cut = leftContext.length() + 1;
                } else if (inside > -1) {
                    cut = inside + leftContext.length() + 2;
                } else {
                    continue;
                }

                // Everything behind the context becomes a new annotation with the same tag.
                ContextAnnotation remainder = new ContextAnnotation(annotation.getStartPosition() + cut, value.substring(cut), annotation.getTag());
                toAdd.add(remainder);

                // The part in front may contain a known entity; only the first dictionary hit is taken.
                String prefix = value.substring(0, cut);
                for (String term : this.entityDictionary.getTerms()) {
                    int termIndex = prefix.indexOf(term + Strings.SINGLE_SPACE_STRING);
                    if (termIndex > -1 && term.length() > 2) {
                        ContextAnnotation fromPrefix = new ContextAnnotation(annotation.getStartPosition() + termIndex, term, this.entityDictionary.getCategoryEntries(term).getMostLikelyCategory());
                        toAdd.add(fromPrefix);
                        LOGGER.debug("add from prefix " + fromPrefix.getValue());
                        break;
                    }
                }

                toRemove.add(annotation);
                LOGGER.debug("add " + remainder.getValue() + ", delete " + annotation.getValue() + " (left context:" + leftContext + ", " + entry.getValue() + ")");
            }
        }
        LOGGER.debug("Unwrapped entities in {}", stopWatch.getElapsedTimeString());
        LOGGER.debug("Add {} entities", Integer.valueOf(toAdd.size()));
        list.addAll(toAdd);
        LOGGER.debug("Remove {} entities", Integer.valueOf(toRemove.size()));
        list.removeAll(toRemove);
    }

    /**
     * Tags a text on the English path: candidates from the string tagger are classified and the result is
     * post-processed in place.
     *
     * @param str the text to tag
     * @return the post-processed annotations
     */
    public Annotations<ContextAnnotation> getAnnotationsEnglish(String str) {
        final List<ContextAnnotation> candidates = StringTagger.getTaggedEntities(str);
        final Annotations<ContextAnnotation> recognized = classifyCandidatesEnglish(candidates);
        postProcessAnnotations(recognized);
        return recognized;
    }

    /**
     * Tags the given text using the language-independent pipeline.
     *
     * <p>Note the side effect: the post-processing flags of this instance are
     * overwritten below and are not restored afterwards.</p>
     *
     * <p>After classification and post-processing, the annotations are sorted and
     * runs of directly adjacent annotations (start position equals the previous
     * end position + 1) carrying the same tag are merged into one annotation whose
     * value is the space-joined values. Finally, annotations tagged "o" and
     * annotations whose value has fewer than two characters are dropped.</p>
     *
     * @param str the text to annotate
     * @return the merged and filtered annotations
     */
    public Annotations<ContextAnnotation> getAnnotationsLanguageIndependent(String str) {
        this.removeDates = false;
        this.removeDateEntries = false;
        this.removeIncorrectlyTaggedInTraining = false;
        this.switchTagAnnotationsUsingPatterns = false;
        this.switchTagAnnotationsUsingDictionary = true;
        this.unwrapEntities = false;
        this.unwrapEntitiesWithContext = false;

        Annotations<ContextAnnotation> candidates = classifyCandidatesLanguageIndependent(StringTagger.getTaggedEntities(str, Tokenizer.TOKEN_SPLIT_REGEX));
        postProcessAnnotations(candidates);
        candidates.sort();

        Annotations<ContextAnnotation> merged = new Annotations<>();
        ContextAnnotation previous = null;
        // the most recent merge result while we are inside a run of adjacent annotations
        ContextAnnotation runHead = null;
        int previousEnd = -2;
        String previousTag = "";
        for (ContextAnnotation current : candidates) {
            String tag = current.getTag();
            boolean continuesRun = !tag.equalsIgnoreCase("o") && tag.equalsIgnoreCase(previousTag) && current.getStartPosition() == previousEnd + 1;
            if (continuesRun) {
                if (runHead == null) {
                    runHead = previous;
                }
                ContextAnnotation combined = new ContextAnnotation(runHead.getStartPosition(), runHead.getValue() + Strings.SINGLE_SPACE_STRING + current.getValue(), tag);
                // Replace the predecessor with the merged annotation. The previous code
                // removed the annotation it had just added, which discarded every merge.
                merged.remove(runHead);
                merged.add(combined);
                runHead = combined;
            } else {
                merged.add(current);
                runHead = null;
            }
            previous = current;
            previousEnd = current.getEndPosition();
            previousTag = tag;
        }

        Annotations<ContextAnnotation> filtered = new Annotations<>();
        for (ContextAnnotation annotation : merged) {
            if (!annotation.getTag().equalsIgnoreCase("o") && annotation.getValue().length() > 1) {
                filtered.add(annotation);
            }
        }
        return filtered;
    }

    /**
     * Re-weights the tags of an annotation using the context patterns recorded in
     * {@code patternProbabilityMatrix}.
     *
     * <p>For every non-empty left/right context (lower-cased), the per-tag counts
     * stored in the matrix are turned into a distribution over tags (count divided
     * by the sum of counts for that context). These distributions are summed over
     * all contexts, normalised so they add up to one, and merged with the
     * annotation's current tags via {@link CategoryEntriesMap#merge}.</p>
     *
     * @param contextAnnotation the annotation whose tags are updated in place
     */
    private void applyContextAnalysis(ContextAnnotation contextAnnotation) {
        String[] leftContexts = contextAnnotation.getLeftContexts();
        String[] rightContexts = contextAnnotation.getRightContexts();
        List<String> contexts = new ArrayList<>(leftContexts.length + rightContexts.length);
        Collections.addAll(contexts, leftContexts);
        Collections.addAll(contexts, rightContexts);

        // start every known tag with a score of zero
        Map<String, Double> scores = new HashMap<>();
        for (String tagName : this.patternProbabilityMatrix.getKeysX()) {
            scores.put(tagName, 0.0d);
        }

        for (String context : contexts) {
            String pattern = context.toLowerCase();
            if (pattern.isEmpty()) {
                continue;
            }
            CountMap<String> matchCounts = CountMap.create();
            int totalMatches = 0;
            for (String tagName : this.patternProbabilityMatrix.getKeysX()) {
                Integer matches = this.patternProbabilityMatrix.get(tagName, pattern);
                if (matches == null) {
                    matchCounts.set(tagName, 0);
                } else {
                    matchCounts.add(tagName, matches.intValue());
                    totalMatches += matches.intValue();
                }
            }
            if (totalMatches == 0) {
                // pattern never seen in training, contributes nothing
                continue;
            }
            for (String tagName : this.patternProbabilityMatrix.getKeysX()) {
                // Cast to double: int / int would truncate every share below 1 to 0.
                double share = (double) matchCounts.getCount(tagName) / totalMatches;
                scores.put(tagName, scores.get(tagName) + share);
            }
        }

        double scoreSum = 0.0d;
        for (String tagName : this.patternProbabilityMatrix.getKeysX()) {
            scoreSum += scores.get(tagName);
        }
        if (scoreSum == 0.0d) {
            // avoid dividing by zero; all scores stay at zero
            scoreSum = 1.0d;
        }

        CategoryEntriesMap contextEntries = new CategoryEntriesMap();
        for (String tagName : this.patternProbabilityMatrix.getKeysX()) {
            contextEntries.set(tagName, scores.get(tagName) / scoreSum);
        }
        contextAnnotation.setTags(CategoryEntriesMap.merge(contextEntries, contextAnnotation.getTags()));
    }

    /**
     * Checks whether the given text consists of nothing but a date fragment.
     *
     * <p>Each entry of {@link RegExp#DATE_FRAGMENTS} is lower-cased and used as a
     * regular expression; if removing all of its matches from the lower-cased text
     * leaves only whitespace (or nothing), the text counts as a date fragment.</p>
     *
     * @param str the text to check
     * @return {@code true} if some date fragment pattern consumes the whole text
     */
    private boolean containsDateFragment(String str) {
        for (String fragmentRegex : RegExp.DATE_FRAGMENTS) {
            String lowered = str.toLowerCase();
            String withoutFragment = lowered.replaceAll(fragmentRegex.toLowerCase(), "");
            if (withoutFragment.trim().length() == 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips leading and trailing date fragments from the given text.
     *
     * <p>For every pattern in {@link RegExp#DATE_FRAGMENTS} four variants are
     * removed, in this order: a leading "fragment ", a trailing " fragment", a
     * leading "fragment. " and a trailing " fragment.". The text is trimmed after
     * each removal.</p>
     *
     * @param str the text to clean
     * @return a pair of the cleaned text and the number of characters by which the
     *         text shrank during the leading removals (including the trim that
     *         follows each of them)
     */
    private Pair<String, Integer> removeDateFragment(String str) {
        String text = str;
        int offsetChange = 0;
        for (String fragment : RegExp.DATE_FRAGMENTS) {
            String group = "(?:" + fragment + ")";
            String leadingWord = "^" + group + Strings.SINGLE_SPACE_STRING;
            String trailingWord = Strings.SINGLE_SPACE_STRING + group + "$";
            String leadingAbbreviation = "^" + group + "\\. ";
            String trailingAbbreviation = Strings.SINGLE_SPACE_STRING + group + "\\.$";

            if (StringHelper.countRegexMatches(text, leadingWord) > 0) {
                // Measure right before the removal; a length captured once per fragment
                // goes stale after earlier removals and over-counts the offset.
                int lengthBefore = text.length();
                text = text.replaceAll(leadingWord, "").trim();
                offsetChange += lengthBefore - text.length();
            }
            if (StringHelper.countRegexMatches(text, trailingWord) > 0) {
                text = text.replaceAll(trailingWord, "").trim();
            }
            if (StringHelper.countRegexMatches(text, leadingAbbreviation) > 0) {
                int lengthBefore = text.length();
                text = text.replaceAll(leadingAbbreviation, "").trim();
                offsetChange += lengthBefore - text.length();
            }
            if (StringHelper.countRegexMatches(text, trailingAbbreviation) > 0) {
                text = text.replaceAll(trailingAbbreviation, "").trim();
            }
        }
        return Pair.of(text, Integer.valueOf(offsetChange));
    }

    /**
     * Learns context information from a column-formatted training file.
     *
     * <p>Three things are populated:</p>
     * <ul>
     * <li>{@code leftContextMap}: for every left context seen in training, 0 if
     * the context occurs at least as often at the start of (or as) an annotation
     * value as it occurs as a context, or if it was seen fewer than two times;
     * 1 otherwise.</li>
     * <li>the context classifier, trained on "left__right" context documents
     * labelled with the annotation tag.</li>
     * <li>{@code patternProbabilityMatrix}: per tag, the count of every
     * (lower-cased) left/right context pattern.</li>
     * </ul>
     *
     * @param str path of the training file, read via
     *            {@link FileFormatParser#getAnnotationsFromColumn}
     * @param list not used by this method
     */
    private void analyzeContexts(String str, List<? extends Annotation> list) {
        LOGGER.debug("start analyzing contexts");
        Map<String, CountMap<String>> contextsByTag = new TreeMap<>();
        CountMap<String> leftContextCounts = CountMap.create();
        this.leftContextMap = CountMap.create();
        Annotations<ContextAnnotation> trainingAnnotations = FileFormatParser.getAnnotationsFromColumn(str);
        List<ClassifiedTextDocument> contextDocuments = CollectionHelper.newArrayList();

        int processed = 1;
        for (ContextAnnotation annotation : trainingAnnotations) {
            String tag = annotation.getTag();
            String[] leftContexts = annotation.getLeftContexts();
            String[] rightContexts = annotation.getRightContexts();
            CountMap<String> tagContexts = contextsByTag.get(tag);
            if (tagContexts == null) {
                tagContexts = CountMap.create();
                contextsByTag.put(tag, tagContexts);
            }
            // exactly the first three left and right contexts are used
            for (int k = 0; k < 3; k++) {
                tagContexts.add(leftContexts[k]);
                leftContextCounts.add(leftContexts[k]);
            }
            for (int k = 0; k < 3; k++) {
                tagContexts.add(rightContexts[k]);
            }
            contextDocuments.add(new ClassifiedTextDocument(tag, annotation.getLeftContext() + "__" + annotation.getRightContext()));
            ProgressHelper.printProgress(processed, trainingAnnotations.size(), 1.0d);
            processed++;
        }

        for (String leftContext : leftContextCounts.uniqueItems()) {
            int seenAsContext = leftContextCounts.getCount(leftContext);
            String asPrefix = leftContext + Strings.SINGLE_SPACE_STRING;
            int seenInsideEntity = 0;
            for (ContextAnnotation annotation : trainingAnnotations) {
                String value = annotation.getValue();
                if (value.startsWith(asPrefix) || value.equals(leftContext)) {
                    seenInsideEntity++;
                }
            }
            // seenAsContext is at least 1 here, so this equals seenInsideEntity >= seenAsContext
            double insideRatio = (double) seenInsideEntity / seenAsContext;
            if (insideRatio >= 1.0d || seenAsContext < 2) {
                this.leftContextMap.add(leftContext, 0);
            } else {
                this.leftContextMap.add(leftContext, 1);
            }
        }

        trainContextClassifier(contextDocuments);

        for (Map.Entry<String, CountMap<String>> tagEntry : contextsByTag.entrySet()) {
            CountMap<String> patternCounts = tagEntry.getValue();
            for (String pattern : patternCounts.uniqueItems()) {
                this.patternProbabilityMatrix.set(tagEntry.getKey(), pattern.toLowerCase(), Integer.valueOf(patternCounts.getCount(pattern)));
            }
        }
    }

    /**
     * Trains {@code contextClassifier} on the given documents and stores the
     * resulting model in {@code contextModel}, replacing any previous one.
     *
     * @param list the labelled context documents to train on
     */
    private void trainContextClassifier(List<ClassifiedTextDocument> list) {
        DictionaryModel trainedModel = this.contextClassifier.train((Iterable<? extends Trainable>) list);
        this.contextModel = trainedModel;
    }

    /**
     * @return the language mode held by this instance
     */
    public LanguageMode getLanguageMode() {
        return languageMode;
    }

    /**
     * @return the training mode held by this instance
     */
    public TrainingMode getTrainingMode() {
        return trainingMode;
    }

    /**
     * @return the entity dictionary model held by this instance
     */
    public DictionaryModel getEntityDictionary() {
        return entityDictionary;
    }

    /**
     * @return the case dictionary model held by this instance
     */
    DictionaryModel getCaseDictionary() {
        return caseDictionary;
    }

    /**
     * @return the left context count map held by this instance (the internal
     *         object, not a copy)
     */
    CountMap<String> getLeftContextMap() {
        return leftContextMap;
    }

    /**
     * @return the {@code removeAnnotations} set held by this instance (the
     *         internal object, not a copy)
     */
    Set<String> getRemoveAnnotations() {
        return removeAnnotations;
    }

    /**
     * @return the context model held by this instance
     */
    DictionaryModel getContextModel() {
        return contextModel;
    }

    /**
     * @return the annotation model held by this instance
     */
    DictionaryModel getAnnotationModel() {
        return annotationModel;
    }

    /**
     * Sets the {@code tagUrls} flag.
     *
     * @param z the new value of the flag
     */
    public void setTagUrls(boolean z) {
        tagUrls = z;
    }

    /**
     * @return the current value of the {@code tagUrls} flag
     */
    public boolean isTagUrls() {
        return tagUrls;
    }

    /**
     * Sets the {@code tagDates} flag.
     *
     * @param z the new value of the flag
     */
    public void setTagDates(boolean z) {
        tagDates = z;
    }

    /**
     * @return the current value of the {@code tagDates} flag
     */
    public boolean isTagDates() {
        return tagDates;
    }

    /**
     * Looks for known entities embedded in an all-uppercase annotation.
     *
     * <p>If the annotation's value is not completely uppercase, an empty result is
     * returned. Otherwise every candidate from {@code list} and every term of the
     * entity dictionary that is shorter than the annotation value and longer than
     * two characters is searched (case-insensitively) as a space-delimited part of
     * the value: in the middle, at the very start, and at the end. Each hit yields
     * a new annotation positioned relative to the wrapping annotation's start.</p>
     *
     * @param annotation the wrapping annotation to inspect
     * @param list known annotations whose values and tags are used as candidates
     * @return the annotations found inside {@code annotation}; possibly empty
     */
    private Annotations<ContextAnnotation> unwrapAnnotations(Annotation annotation, List<ContextAnnotation> list) {
        Annotations<ContextAnnotation> unwrapped = new Annotations<>();
        if (!StringHelper.isCompletelyUppercase(annotation.getValue())) {
            return unwrapped;
        }
        String wrapperLower = annotation.getValue().toLowerCase();
        int wrapperLength = wrapperLower.length();

        for (ContextAnnotation candidate : list) {
            if (candidate.getValue().length() >= wrapperLength) {
                continue;
            }
            for (int offset : findEmbeddedOffsets(wrapperLower, candidate.getValue())) {
                unwrapped.add(new ContextAnnotation(annotation.getStartPosition() + offset, candidate.getValue(), candidate.getTag()));
            }
        }

        for (String term : this.entityDictionary.getTerms()) {
            if (term.length() >= wrapperLength) {
                continue;
            }
            // looked up for every sufficiently short term, whether or not it matches below
            String category = this.entityDictionary.getCategoryEntries(term).getMostLikelyCategory();
            for (int offset : findEmbeddedOffsets(wrapperLower, term)) {
                unwrapped.add(new ContextAnnotation(annotation.getStartPosition() + offset, term, category));
            }
        }
        return unwrapped;
    }

    /**
     * Computes the offsets at which {@code term} (compared lower-cased) occurs as a
     * space-delimited part of {@code haystack}: first an inner occurrence, then a
     * leading one, then a trailing one. Terms of two or fewer characters never match.
     */
    private static List<Integer> findEmbeddedOffsets(String haystack, String term) {
        List<Integer> offsets = new ArrayList<>(3);
        if (term.length() <= 2) {
            return offsets;
        }
        String needle = term.toLowerCase();
        int inner = haystack.indexOf(Strings.SINGLE_SPACE_STRING + needle + Strings.SINGLE_SPACE_STRING);
        if (inner > -1) {
            offsets.add(inner + 1);
        }
        if (haystack.startsWith(needle + Strings.SINGLE_SPACE_STRING)) {
            offsets.add(0);
        }
        // first occurrence only: a trailing hit is missed if " term" also occurs earlier
        int trailing = haystack.indexOf(Strings.SINGLE_SPACE_STRING + needle);
        if (trailing == haystack.length() - term.length() - 1) {
            offsets.add(trailing + 1);
        }
        return offsets;
    }

    /**
     * @return a display name of the form
     *         {@code Palladian NER (<languageMode>,<trainingMode>)}
     */
    @Override // ws.palladian.extraction.entity.NamedEntityRecognizer
    public String getName() {
        StringBuilder name = new StringBuilder("Palladian NER (");
        name.append(this.languageMode);
        name.append(",");
        name.append(this.trainingMode);
        name.append(")");
        return name.toString();
    }

    /**
     * Developer scratch entry point that chains several train/evaluate experiments
     * against hard-coded file paths.
     *
     * <p>The stages are separated by {@code System.exit(0)} calls. Because the first
     * of these terminates the JVM right after the initial training run, none of the
     * later stages execute as the code stands.</p>
     *
     * @param strArr command line arguments; not used
     */
    public static void main(String[] strArr) {
        // instance is created and immediately discarded
        new PalladianNer();
        PalladianNer palladianNer = new PalladianNer(LanguageMode.English, TrainingMode.Complete);
        palladianNer.setTagDates(false);
        palladianNer.setTagUrls(false);
        // the returned seed annotations are not used
        FileFormatParser.getSeedAnnotations(PalladianNer.class.getResource("/nerSeeds.txt").getFile(), -1);
        palladianNer.train("data/datasets/ner/tud/tud2011_train.txt", "data/temp/palladianNerTudCs4");
        // stage 1 ends here: the JVM exits, so nothing below this line runs
        System.exit(0);
        // stage 2: load the model trained above and tag a sample sentence
        palladianNer.loadModel("data/temp/palladianNerTudCs4");
        System.out.println(palladianNer.tag("Peter J. Johnson lives in New York City in the U.S.A."));
        CollectionHelper.print(palladianNer.getAnnotations("Peter J. Johnson lives in New York City in the U.S.A."));
        System.exit(0);
        // stage 3: evaluate on the test set and print MUC and exact-match results
        EvaluationResult evaluate = palladianNer.evaluate("data/datasets/ner/tud/tud2011_test.txt", TaggingFormat.COLUMN);
        System.out.println(evaluate.getMUCResultsReadable());
        System.out.println(evaluate.getExactMatchResultsReadable());
        System.exit(0);
        // stage 4: five sparse-mode train/evaluate rounds; the seed values collected
        // below are passed to evaluate(...) as its third argument
        String str = "data/temp/autoGeneratedDataConll/seedsTest1.txt";
        StopWatch stopWatch = new StopWatch();
        StringBuilder sb = new StringBuilder();
        HashSet hashSet = new HashSet();
        Iterator<T> it = FileFormatParser.getSeedAnnotations(str, -1).iterator();
        while (it.hasNext()) {
            hashSet.add(((ContextAnnotation) it.next()).getValue());
        }
        for (int i = 1; i <= 5; i++) {
            int i2 = i;
            if (i2 > 1) {
                // computed but never read
                int i3 = i2 * 10;
            }
            // same path in every iteration, so all five rounds use the same dataset
            str = "data/temp/autoGeneratedDataTUD/newDataset10.txt";
            PalladianNer palladianNer2 = new PalladianNer(LanguageMode.English, TrainingMode.Sparse);
            palladianNer2.train(str, "data/temp/tudner");
            palladianNer2.loadModel("data/temp/tudner");
            EvaluationResult evaluate2 = palladianNer2.evaluate("data/datasets/ner/tud/tud2011_test.txt", TaggingFormat.COLUMN, hashSet);
            // one result row per round: exact-match P/R/F1 followed by MUC P/R/F1
            sb.append(evaluate2.getPrecision(EvaluationResult.EvaluationMode.EXACT_MATCH)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(evaluate2.getRecall(EvaluationResult.EvaluationMode.EXACT_MATCH)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(evaluate2.getF1(EvaluationResult.EvaluationMode.EXACT_MATCH)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(evaluate2.getPrecision(EvaluationResult.EvaluationMode.MUC)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(evaluate2.getRecall(EvaluationResult.EvaluationMode.MUC)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(evaluate2.getF1(EvaluationResult.EvaluationMode.MUC)).append(ClassificationUtils.DEFAULT_SEPARATOR);
            sb.append(FileHelper.NEWLINE_CHARACTER);
            // the accumulated rows are written again after every round
            FileHelper.writeToFile("results.txt", sb);
        }
        System.exit(0);
        // stage 5: complete-mode training on the dataset path left in str by stage 4
        PalladianNer palladianNer3 = new PalladianNer(LanguageMode.English, TrainingMode.Complete);
        // the returned seed annotations are not used
        FileFormatParser.getSeedAnnotations("data/datasets/ner/tud/manuallyPickedSeeds/seedListC.txt", 50);
        palladianNer3.train(str, "data/temp/tudner");
        palladianNer3.loadModel("data/temp/tudner");
        EvaluationResult evaluate3 = palladianNer3.evaluate("data/datasets/ner/tud/tud2011_test.txt", TaggingFormat.COLUMN);
        System.out.println(evaluate3.getMUCResultsReadable());
        System.out.println(evaluate3.getExactMatchResultsReadable());
        System.out.println(stopWatch.getElapsedTimeString());
    }
}
