package ws.palladian.extraction;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import ws.palladian.extraction.feature.DuplicateTokenRemover;
import ws.palladian.extraction.feature.LengthTokenRemover;
import ws.palladian.extraction.feature.RegExTokenRemover;
import ws.palladian.extraction.feature.StemmerAnnotator;
import ws.palladian.extraction.feature.StopTokenRemover;
import ws.palladian.extraction.feature.TokenMetricsCalculator;
import ws.palladian.extraction.token.BaseTokenizer;
import ws.palladian.extraction.token.RegExTokenizer;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.processing.DocumentUnprocessableException;
import ws.palladian.processing.ProcessingPipeline;
import ws.palladian.processing.TextDocument;
import ws.palladian.processing.features.ListFeature;
import ws.palladian.processing.features.NominalFeature;
import ws.palladian.processing.features.NumericFeature;
import ws.palladian.processing.features.PositionAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/StemmedTokenExtractor.class */
/**
 * A {@link ProcessingPipeline} that tokenizes a text and exposes, for every token that
 * is left at the end of the pipeline, the value of its {@link StemmerAnnotator#STEM}
 * feature together with the value of its {@link TokenMetricsCalculator#FREQUENCY}
 * feature.
 *
 * <p>The stages are registered in the constructor in this order:
 * {@link RegExTokenizer}, {@link StemmerAnnotator}, {@link StopTokenRemover},
 * {@link LengthTokenRemover}, {@link RegExTokenRemover},
 * {@link TokenMetricsCalculator} and {@link DuplicateTokenRemover}.
 */
public class StemmedTokenExtractor extends ProcessingPipeline {

    /**
     * Assembles the pipeline for the given language.
     *
     * @param language the language handed to the {@link StemmerAnnotator} and the
     *                 {@link StopTokenRemover}
     */
    public StemmedTokenExtractor(Language language) {
        add(new RegExTokenizer());
        add(new StemmerAnnotator(language));
        add(new StopTokenRemover(language));
        // NOTE(review): 2 is presumably a minimum token length -- confirm against LengthTokenRemover.
        add(new LengthTokenRemover(2));
        // NOTE(review): whether matching or non-matching tokens are dropped is decided by
        // RegExTokenRemover, which is not visible here -- confirm before changing the pattern.
        add(new RegExTokenRemover("[A-Za-z0-9\\.]+"));
        add(new TokenMetricsCalculator());
        add(new DuplicateTokenRemover());
    }

    /**
     * Runs the pipeline on the given text and collects one map entry per remaining
     * token annotation.
     *
     * <p>If several annotations carry the same stem, the entry written last wins,
     * because the results are collected with {@link Map#put(Object, Object)}.
     *
     * @param str the text to process
     * @return a new, mutable map from stem to frequency value
     * @throws IllegalArgumentException if the pipeline reports the document as
     *                                  unprocessable; the original
     *                                  {@link DocumentUnprocessableException} is kept as cause
     */
    public Map<String, Double> getTokens(String str) {
        // Only the pipeline run itself is guarded; this is the call whose checked
        // exception gets translated into an unchecked one for the caller.
        TextDocument processedDocument;
        try {
            processedDocument = (TextDocument) process(new TextDocument(str));
        } catch (DocumentUnprocessableException e) {
            throw new IllegalArgumentException(e);
        }

        // The feature lookup is keyed by the raw ListFeature class, so the element type
        // cannot be checked by the compiler; the unchecked cast is confined to this local.
        @SuppressWarnings("unchecked")
        List<PositionAnnotation> tokenAnnotations =
                (List<PositionAnnotation>) processedDocument.get(ListFeature.class, BaseTokenizer.PROVIDED_FEATURE);

        Map<String, Double> frequencyByStem = new HashMap<String, Double>();
        for (PositionAnnotation tokenAnnotation : tokenAnnotations) {
            NominalFeature stemFeature = (NominalFeature) tokenAnnotation.getFeatureVector()
                    .get(NominalFeature.class, StemmerAnnotator.STEM);
            NumericFeature frequencyFeature = (NumericFeature) tokenAnnotation.getFeatureVector()
                    .get(NumericFeature.class, TokenMetricsCalculator.FREQUENCY);
            // Explicit casts keep the map type-safe: a value of an unexpected type fails
            // here instead of leaking into the returned Map<String, Double>.
            String stem = (String) stemFeature.getValue();
            Double frequency = (Double) frequencyFeature.getValue();
            frequencyByStem.put(stem, frequency);
        }
        return frequencyByStem;
    }

    /**
     * Small demo: runs the German pipeline on a sample text and prints the resulting
     * map via {@link CollectionHelper#print}.
     *
     * @param strArr command line arguments (unused)
     */
    public static void main(String[] strArr) {
        StemmedTokenExtractor extractor = new StemmedTokenExtractor(Language.GERMAN);
        Map<String, Double> tokens = extractor.getTokens("Die vom Verein für Internet-Benutzer Österreichs gestartete Bürgerinitiative hat bereits 4.471 Unterschriften auf Papier gesammelt und ans Parlament übermittelt. Nun muss sich der Nationalratsausschuss für Petitionen und Bürgerinitiativen damit befassen.");
        CollectionHelper.print(tokens);
    }
}
