package ws.palladian.extraction.phrase;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkFactory;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingImpl;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.tag.Tagging;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.util.FastCache;
import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.slf4j.Marker;
import ws.palladian.helper.Cache;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.processing.features.Annotation;
import ws.palladian.processing.features.ImmutableAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/phrase/LingPipePhraseChunker.class */
/**
 * {@link PhraseChunker} that tokenizes a text with LingPipe's Indo-European tokenizer, tags the
 * tokens with a Hidden Markov Model part-of-speech tagger and then groups runs of tokens into noun
 * phrases ("NP") and verb phrases ("VP") based on fixed sets of start/continue tags.
 *
 * <p>
 * NOTE(review): the tag names look like the Brown corpus tag set, so the supplied model is
 * presumably expected to emit Brown tags; confirm against the model actually deployed.
 * </p>
 */
public final class LingPipePhraseChunker implements PhraseChunker {

    private static final Set<String> DETERMINER_TAGS = tagSet("abn", "abx", "ap", "ap$", "at", "cd", "cd$", "dt",
            "dt$", "dti", "dts", "dtx", "od");

    private static final Set<String> ADJECTIVE_TAGS = tagSet("jj", "jj$", "jjr", "jjs", "jjt", "*", "ql");

    private static final Set<String> NOUN_TAGS = tagSet("nn", "nn$", "nns", "nns$", "np", "np$", "nps", "nps$", "nr",
            "nr$", "nrs");

    private static final Set<String> PRONOUN_TAGS = tagSet("pn", "pn$", "pp$", "pp$$", "ppl", "ppls", "ppo", "pps",
            "ppss");

    private static final Set<String> ADVERB_TAGS = tagSet("rb", "rb$", "rbr", "rbt", "rn", "ql", "*");

    private static final Set<String> VERB_TAGS = tagSet("vb", "vbd", "vbg", "vbn", "vbz");

    private static final Set<String> AUXILIARY_VERB_TAGS = tagSet("to", "md", "be", "bed", "bedz", "beg", "bem",
            "ben", "ber", "bez");

    private static final Set<String> PUNCTUATION_TAGS = tagSet("'", ".", "*");

    /** Tags which may open a verb phrase. */
    private static final Set<String> START_VERB_TAGS = new HashSet<String>();

    /** Tags which may extend an already opened verb phrase. */
    private static final Set<String> CONTINUE_VERB_TAGS = new HashSet<String>();

    /** Tags which may open a noun phrase. */
    private static final Set<String> START_NOUN_TAGS = new HashSet<String>();

    /** Tags which may extend an already opened noun phrase. */
    private static final Set<String> CONTINUE_NOUN_TAGS = new HashSet<String>();

    private static final String PHRASE_CHUNKER_NAME = "LingPipe Phrase Chunker";

    /** Chunk type assigned to noun phrases. */
    private static final String NOUN_PHRASE = "NP";

    /** Chunk type assigned to verb phrases. */
    private static final String VERB_PHRASE = "VP";

    /** Size passed to the {@link FastCache} handed to the {@link HmmDecoder}. */
    private static final int DECODER_CACHE_SIZE = 100;

    static {
        START_NOUN_TAGS.addAll(DETERMINER_TAGS);
        START_NOUN_TAGS.addAll(ADJECTIVE_TAGS);
        START_NOUN_TAGS.addAll(NOUN_TAGS);
        START_NOUN_TAGS.addAll(PRONOUN_TAGS);

        CONTINUE_NOUN_TAGS.addAll(START_NOUN_TAGS);
        CONTINUE_NOUN_TAGS.addAll(ADVERB_TAGS);
        CONTINUE_NOUN_TAGS.addAll(PUNCTUATION_TAGS);
        CONTINUE_NOUN_TAGS.add("cc");

        START_VERB_TAGS.addAll(VERB_TAGS);
        START_VERB_TAGS.addAll(AUXILIARY_VERB_TAGS);
        START_VERB_TAGS.addAll(ADVERB_TAGS);

        CONTINUE_VERB_TAGS.addAll(START_VERB_TAGS);
        CONTINUE_VERB_TAGS.addAll(PUNCTUATION_TAGS);
    }

    /** The part-of-speech model used for tagging. */
    private final HiddenMarkovModel model;

    /**
     * Creates a new phrase chunker using the serialized Hidden Markov Model stored in the given file.
     *
     * @param file The file containing the serialized {@link HiddenMarkovModel}.
     * @throws IllegalStateException In case the model cannot be read or deserialized.
     */
    public LingPipePhraseChunker(File file) {
        this.model = loadModel(file);
    }

    /**
     * Builds a mutable set holding the given tags.
     *
     * @param tags The tags to put into the set.
     * @return A new set with the given tags.
     */
    private static Set<String> tagSet(String... tags) {
        Set<String> result = new HashSet<String>();
        for (String tag : tags) {
            result.add(tag);
        }
        return result;
    }

    /**
     * Tokenizes and tags the given character slice and collects noun and verb phrase chunks.
     *
     * @param cArr The characters to process.
     * @param i Index of the first character of the slice.
     * @param i2 Index one past the last character of the slice.
     * @return The chunking with "NP" and "VP" chunks; chunk offsets are counted from the beginning of
     *         the slice.
     */
    private Chunking chunk(char[] cArr, int i, int i2) {
        List<String> tokenList = new ArrayList<String>();
        List<String> whitespaceList = new ArrayList<String>();
        IndoEuropeanTokenizerFactory.INSTANCE.tokenizer(cArr, i, i2 - i).tokenize(tokenList, whitespaceList);
        String[] tokens = tokenList.toArray(new String[tokenList.size()]);
        // whitespaces[k] is the whitespace preceding tokens[k]
        String[] whitespaces = whitespaceList.toArray(new String[whitespaceList.size()]);

        HmmDecoder decoder = new HmmDecoder(model, null, new FastCache<String, double[]>(DECODER_CACHE_SIZE));
        Tagging<String> tagging = decoder.tag(tokenList);

        ChunkingImpl chunking = new ChunkingImpl(cArr, i, i2);
        int position = 0; // character offset directly behind the last consumed token
        int index = 0; // index of the token currently looked at
        while (index < tagging.size()) {
            int chunkStart = position + whitespaces[index].length();
            String startTag = tagging.tag(index);

            // noun phrases take precedence, as some tags may open either phrase type
            String chunkType;
            Set<String> continueTags;
            if (START_NOUN_TAGS.contains(startTag)) {
                chunkType = NOUN_PHRASE;
                continueTags = CONTINUE_NOUN_TAGS;
            } else if (START_VERB_TAGS.contains(startTag)) {
                chunkType = VERB_PHRASE;
                continueTags = CONTINUE_VERB_TAGS;
            } else {
                // token belongs to no phrase, just step over it
                position = chunkStart + tokens[index].length();
                index++;
                continue;
            }

            // greedily extend the phrase as long as the following tokens carry a continue tag
            int firstIndex = index;
            int chunkEnd = chunkStart + tokens[index].length();
            index++;
            while (index < tokens.length && continueTags.contains(tagging.tag(index))) {
                chunkEnd += whitespaces[index].length() + tokens[index].length();
                index++;
            }

            // strip trailing punctuation from the phrase; stopping at the first token of the phrase is
            // sufficient, because if that one is stripped too, the trimmed end is no longer behind the
            // start and the phrase is discarded below
            int trimmedEnd = chunkEnd;
            for (int last = index - 1; last >= firstIndex && PUNCTUATION_TAGS.contains(tagging.tag(last)); last--) {
                trimmedEnd -= whitespaces[last].length() + tokens[last].length();
            }

            if (chunkStart < trimmedEnd) {
                chunking.add(ChunkFactory.createChunk(chunkStart, trimmedEnd, chunkType));
            }
            // continue behind the untrimmed end, the trimmed punctuation has been consumed as well
            position = chunkEnd;
        }
        return chunking;
    }

    /**
     * Splits the given text into noun and verb phrases.
     *
     * @param str The text to chunk.
     * @return One annotation per detected phrase, carrying the start offset within the text, the
     *         covered text and the chunk type ("NP" or "VP") as tag.
     */
    @Override // ws.palladian.extraction.phrase.PhraseChunker
    public List<Annotation> chunk(String str) {
        char[] characters = Strings.toCharArray(str);
        Chunking chunking = chunk(characters, 0, characters.length);
        List<Annotation> annotations = new ArrayList<Annotation>();
        for (Chunk phrase : chunking.chunkSet()) {
            String value = str.substring(phrase.start(), phrase.end());
            annotations.add(new ImmutableAnnotation(phrase.start(), value, phrase.type()));
        }
        return annotations;
    }

    /**
     * Obtains the model for the given file, either from the {@link Cache} (keyed by the absolute file
     * path) or by deserializing it from the file, in which case it is put into the cache afterwards.
     *
     * <p>
     * NOTE(review): this uses Java native deserialization, so the model file must come from a trusted
     * source.
     * </p>
     *
     * @param file The file containing the serialized model.
     * @return The model.
     * @throws IllegalStateException In case reading or deserializing the file fails; the original
     *             exception is attached as cause.
     */
    private final HiddenMarkovModel loadModel(File file) {
        String absolutePath = file.getAbsolutePath();
        HiddenMarkovModel hiddenMarkovModel = (HiddenMarkovModel) Cache.getInstance().getDataObject(absolutePath);
        if (hiddenMarkovModel != null) {
            return hiddenMarkovModel;
        }

        FileInputStream fileStream = null;
        ObjectInputStream objectStream = null;
        try {
            fileStream = new FileInputStream(file);
            objectStream = new ObjectInputStream(fileStream);
            hiddenMarkovModel = (HiddenMarkovModel) objectStream.readObject();
            Cache.getInstance().putDataObject(absolutePath, hiddenMarkovModel);
            return hiddenMarkovModel;
        } catch (IOException e) {
            throw new IllegalStateException("Error while loading model file \"" + absolutePath + "\": " + e.getMessage(), e);
        } catch (ClassNotFoundException e) {
            throw new IllegalStateException("Error while loading model file \"" + absolutePath + "\": " + e.getMessage(), e);
        } finally {
            if (objectStream != null) {
                // closing the object stream also closes the wrapped file stream
                FileHelper.close(objectStream);
            } else {
                // wrapping failed (or the file could not be opened), so release the file stream itself
                FileHelper.close(fileStream);
            }
        }
    }

    /**
     * @return The human-readable name of this phrase chunker.
     */
    @Override // ws.palladian.extraction.phrase.PhraseChunker
    public String getName() {
        return PHRASE_CHUNKER_NAME;
    }
}
