package ws.palladian.extraction.pos;

import com.aliasi.corpus.DiskCorpus;
import edu.smu.tspell.wordnet.impl.file.SenseKey;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.classification.Instance;
import ws.palladian.classification.text.FeatureSetting;
import ws.palladian.classification.text.PreprocessingPipeline;
import ws.palladian.classification.universal.UniversalClassifier;
import ws.palladian.classification.universal.UniversalClassifierModel;
import ws.palladian.helper.Cache;
import ws.palladian.helper.ProgressHelper;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.ConfusionMatrix;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.processing.Classifiable;
import ws.palladian.processing.DocumentUnprocessableException;
import ws.palladian.processing.TextDocument;
import ws.palladian.processing.Trainable;
import ws.palladian.processing.features.Feature;
import ws.palladian.processing.features.NominalFeature;
import ws.palladian.processing.features.PositionAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/pos/PalladianPosTagger.class */
/**
 * Part-of-speech tagger that classifies each token with a {@link UniversalClassifier} built from
 * character n-gram text features and a fixed set of nominal word-shape features.
 *
 * <p>An instance created with the no-argument constructor has no model until
 * {@link #trainModel(String, String)} or {@link #evaluate(String, String)} has assigned one.
 *
 * <p>NOTE(review): the references to {@code DiskCorpus.DEFAULT_TEST_DIR_NAME},
 * {@code CoreLabel.TAG_SEPARATOR} and {@code SenseKey.LEMMA_TERMINATOR} look like constants
 * borrowed from unrelated libraries purely for their string value — confirm the values and
 * consider replacing them with local literals.
 */
public class PalladianPosTagger extends BasePosTagger {
    private static final Logger LOGGER = LoggerFactory.getLogger(PalladianPosTagger.class);
    private static final String TAGGER_NAME = "Palladian POS-Tagger";

    /** Classifier used both for training and for tagging; created once per tagger instance. */
    private final UniversalClassifier tagger;

    /** Trained model; stays {@code null} after the no-argument constructor until trained or loaded. */
    private UniversalClassifierModel model;

    /**
     * Creates a tagger backed by the model stored under the given path. The model is taken from
     * the shared {@link Cache} when present; otherwise it is deserialized from the path and put
     * into the cache under that same key.
     *
     * @param modelFilePath path of the serialized model, also used as the cache key
     */
    public PalladianPosTagger(String modelFilePath) {
        this.model = (UniversalClassifierModel) Cache.getInstance().getDataObject(modelFilePath);
        if (this.model == null) {
            this.model = (UniversalClassifierModel) FileHelper.deserialize(modelFilePath);
            Cache.getInstance().putDataObject(modelFilePath, this.model);
        }
        this.tagger = getTagger();
    }

    /** Creates a tagger without a model, e.g. as a starting point for {@link #trainModel}. */
    public PalladianPosTagger() {
        this.tagger = getTagger();
    }

    /**
     * Classifies every annotation's value and assigns the most likely category as its tag.
     *
     * @param annotations the token annotations to tag, processed in list order
     */
    @Override // ws.palladian.extraction.pos.BasePosTagger
    public void tag(List<PositionAnnotation> annotations) {
        String previousTag = "";
        for (PositionAnnotation annotation : annotations) {
            String tag = classifyToken(previousTag, annotation.getValue());
            assignTag(annotation, Arrays.asList(tag));
            previousTag = tag;
        }
    }

    /** Builds the classifier: text plus nominal features, character n-grams of length 1 to 7. */
    private UniversalClassifier getTagger() {
        FeatureSetting featureSetting = new FeatureSetting(FeatureSetting.TextFeatureType.CHAR_NGRAMS, 1, 7);
        return new UniversalClassifier(
                EnumSet.of(UniversalClassifier.ClassifierSetting.TEXT, UniversalClassifier.ClassifierSetting.NOMINAL),
                featureSetting);
    }

    /**
     * Trains a model from all files returned by {@code FileHelper.getFiles(folderPath)},
     * serializes it to {@code modelFilePath} and stores it in the shared {@link Cache}.
     *
     * <p>Each file is split on single whitespace characters; every resulting token is expected to
     * have the form {@code word<separator>tag}. Tokens that do not fit are skipped.
     *
     * @param folderPath    folder containing the tagged training files
     * @param modelFilePath destination path of the serialized model, also used as the cache key
     */
    public void trainModel(String folderPath, String modelFilePath) {
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("start training the tagger");

        List<Instance> trainingInstances = new ArrayList<Instance>();
        File[] corpusFiles = FileHelper.getFiles(folderPath);
        for (int i = 0; i < corpusFiles.length; i++) {
            String previousTag = "";
            for (String token : FileHelper.readFileToString(corpusFiles[i]).split("\\s")) {
                String[] wordAndTag = splitTaggedToken(token);
                if (wordAndTag == null) {
                    continue;
                }
                Instance instance = new Instance(normalizeTag(wordAndTag[1]));
                setFeatures(instance, previousTag, wordAndTag[0]);
                trainingInstances.add(instance);
                previousTag = wordAndTag[1];
            }
            // Progress is reported 1-based, once per completed file.
            ProgressHelper.printProgress(i + 1, corpusFiles.length, 1.0d);
        }
        LOGGER.info("all files read in {}", stopWatch.getElapsedTimeString());

        this.model = this.tagger.train(trainingInstances);
        FileHelper.serialize(this.model, modelFilePath);
        Cache.getInstance().putDataObject(modelFilePath, this.model);
        LOGGER.info("finished training tagger in {}", stopWatch.getElapsedTimeString());
    }

    /**
     * Adds the nominal word-shape features and the text features of {@code word} to the
     * instance's feature vector.
     *
     * <p>NOTE(review): {@code previousTag} is accepted but not turned into a feature. Adding it
     * would change the feature set of already trained models, so it is left as is.
     *
     * @param instance    the instance whose feature vector is extended
     * @param previousTag tag of the preceding token (currently unused)
     * @param word        the token text; an empty word gets empty first/last-character features
     *                    rather than failing in {@code substring}
     * @throws IllegalStateException if the text preprocessing cannot handle the word
     */
    private void setFeatures(Instance instance, String previousTag, String word) {
        int length = word.length();
        String lastCharacter = length > 0 ? word.substring(length - 1) : "";
        String firstCharacter = length > 0 ? word.substring(0, 1) : "";
        String lastTwoCharacters = length > 1 ? word.substring(length - 2) : "";

        // The order matters: feature names are derived from the position in the vector.
        String[] nominalValues = {
            String.valueOf(StringHelper.startsUppercase(word)),
            String.valueOf(length == 1),
            String.valueOf(length == 2),
            String.valueOf(length == 3),
            String.valueOf(length),
            String.valueOf(StringHelper.isNumberOrNumberWord(word)),
            String.valueOf(StringHelper.isCompletelyUppercase(word)),
            // Number of punctuation characters from the listed set contained in the word.
            String.valueOf(word.replaceAll("[^`'\",.:;*\\(\\)]", "").length()),
            lastCharacter,
            firstCharacter,
            lastTwoCharacters,
            word
        };
        for (String value : nominalValues) {
            String featureName = ("nom" + instance.getFeatureVector().size()).intern();
            instance.getFeatureVector().add(new NominalFeature(featureName, value));
        }

        try {
            PreprocessingPipeline pipeline = new PreprocessingPipeline(this.tagger.getFeatureSetting());
            TextDocument document = new TextDocument(word);
            pipeline.process(document);
            for (Feature<?> feature : document.getFeatureVector()) {
                instance.getFeatureVector().add(feature);
            }
        } catch (DocumentUnprocessableException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Evaluates the model stored under {@code modelFilePath} against all tagged files returned by
     * {@code FileHelper.getFiles(folderPath)} and logs the accuracy and the confusion matrix.
     *
     * <p>The expected tag of each token is its normalized, lower-cased corpus tag. When no usable
     * token is found, a warning is logged instead of an accuracy value.
     *
     * @param folderPath    folder containing the tagged evaluation files
     * @param modelFilePath path of the serialized model, also used as the cache key
     */
    public void evaluate(String folderPath, String modelFilePath) {
        this.model = (UniversalClassifierModel) Cache.getInstance().getDataObject(modelFilePath, new File(modelFilePath));
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("start evaluating the tagger");

        ConfusionMatrix confusionMatrix = new ConfusionMatrix();
        int correct = 0;
        int total = 0;
        File[] corpusFiles = FileHelper.getFiles(folderPath);
        for (int i = 0; i < corpusFiles.length; i++) {
            String previousTag = "";
            for (String token : FileHelper.readFileToString(corpusFiles[i]).split("\\s")) {
                String[] wordAndTag = splitTaggedToken(token);
                if (wordAndTag == null) {
                    continue;
                }
                String predictedTag = classifyToken(previousTag, wordAndTag[0]);
                String expectedTag = normalizeTag(wordAndTag[1]).toLowerCase();
                // Unlike training, the predicted (not the corpus) tag is carried forward.
                previousTag = predictedTag;
                confusionMatrix.add(expectedTag, predictedTag);
                if (predictedTag.equals(expectedTag)) {
                    correct++;
                }
                total++;
            }
            // Progress is reported 1-based, once per completed file.
            ProgressHelper.printProgress(i + 1, corpusFiles.length, 1.0d);
        }
        LOGGER.info("all files read in {}", stopWatch.getElapsedTimeString());

        if (total > 0) {
            LOGGER.info("Accuracy: " + MathHelper.round((100.0d * correct) / total, 2) + SenseKey.LEMMA_TERMINATOR);
        } else {
            // Avoids a 0/0 division producing NaN when the folder holds no tagged tokens.
            LOGGER.warn("no tagged tokens found in {}, accuracy not computed", folderPath);
        }
        LOGGER.info(FileHelper.NEWLINE_CHARACTER + confusionMatrix);
        LOGGER.info("finished evaluating the tagger in {}", stopWatch.getElapsedTimeString());
    }

    /**
     * Classifies a single word with the current model.
     *
     * @param previousTag tag of the preceding token, forwarded to {@link #setFeatures}
     * @param word        the token text
     * @return the most likely category reported by the classifier
     */
    private String classifyToken(String previousTag, String word) {
        // The instance's target class is only a placeholder; it is not known at tagging time.
        Instance instance = new Instance(DiskCorpus.DEFAULT_TEST_DIR_NAME);
        setFeatures(instance, previousTag, word);
        return this.tagger.classify((Classifiable) instance.getFeatureVector(), this.model).getMostLikelyCategory();
    }

    /**
     * Splits a corpus token of the form {@code word<separator>tag}.
     *
     * <p>The length check happens before any element is read: {@code String.split} drops trailing
     * empty strings, so a token made only of separator characters produces an empty array.
     *
     * <p>NOTE(review): a word that itself contains the separator is split at the first
     * separator, so index 1 is then not the real tag — confirm whether the corpus can contain
     * such tokens.
     *
     * @param token one whitespace-delimited token from a corpus file
     * @return the split parts (index 0 word, index 1 tag), or {@code null} when the token is
     *         empty, has no tag part, or has an empty word part
     */
    private static String[] splitTaggedToken(String token) {
        if (token.isEmpty()) {
            return null;
        }
        String[] parts = token.split(CoreLabel.TAG_SEPARATOR);
        if (parts.length < 2 || parts[0].isEmpty()) {
            return null;
        }
        return parts;
    }

    /** Returns the display name of this tagger. */
    @Override // ws.palladian.extraction.pos.BasePosTagger
    public String getName() {
        return TAGGER_NAME;
    }

    /** Entry point placeholder; intentionally does nothing. */
    public static void main(String[] strArr) {
    }
}
