package ws.palladian.classification.text;

import edu.stanford.nlp.ling.CoreLabel;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.Validate;
import ws.palladian.classification.text.FeatureSetting;
import ws.palladian.extraction.feature.AbstractTokenRemover;
import ws.palladian.extraction.feature.CharNGramCreator;
import ws.palladian.extraction.feature.DuplicateTokenRemover;
import ws.palladian.extraction.feature.LengthTokenRemover;
import ws.palladian.extraction.feature.LowerCaser;
import ws.palladian.extraction.feature.NGramCreator;
import ws.palladian.extraction.token.RegExTokenizer;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.processing.ProcessingPipeline;
import ws.palladian.processing.features.PositionAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/classification/text/PreprocessingPipeline.class */
/**
 * A {@link ProcessingPipeline} whose processors are chosen from a {@link FeatureSetting}.
 *
 * <p>The processors are connected in this order:
 * <ol>
 * <li>{@link LowerCaser}</li>
 * <li>either a {@link CharNGramCreator} (for {@code CHAR_NGRAMS}), or a {@link RegExTokenizer}
 * followed by an {@link NGramCreator} (for any other feature type)</li>
 * <li>a {@link LengthTokenRemover}, only for {@code WORD_NGRAMS}</li>
 * <li>{@link DuplicateTokenRemover}</li>
 * <li>an internal remover for unwanted tokens</li>
 * </ol>
 */
public class PreprocessingPipeline extends ProcessingPipeline {

    /**
     * Token remover which drops every token for which
     * {@link StringHelper#containsAny} reports a match against {@link #UNWANTED_SUBSTRINGS},
     * or for which {@link StringHelper#isNumber} returns {@code true}.
     */
    private static final class UnwantedTokenRemover extends AbstractTokenRemover {

        /**
         * Strings checked against each token value. Built once instead of on every
         * {@link #remove} call, since the content never changes.
         *
         * <p>NOTE(review): {@code CoreLabel.TAG_SEPARATOR} is most likely a decompiler
         * artifact standing in for a plain string literal (presumably {@code "/"}) --
         * confirm against the CoreNLP constant before replacing it.
         */
        private static final List<String> UNWANTED_SUBSTRINGS = Arrays.asList("&", CoreLabel.TAG_SEPARATOR, "=");

        private UnwantedTokenRemover() {
        }

        /**
         * Decides whether the given annotation should be removed.
         *
         * @param positionAnnotation the annotation whose value is inspected
         * @return {@code true} if the value matches one of the unwanted strings or is a number
         */
        @Override
        protected boolean remove(PositionAnnotation positionAnnotation) {
            String value = positionAnnotation.getValue();
            return StringHelper.containsAny(value, UNWANTED_SUBSTRINGS) || StringHelper.isNumber(value);
        }
    }

    /**
     * Creates the pipeline for the given feature setting.
     *
     * @param featureSetting the setting supplying the feature type, n-gram lengths, term length
     *        bounds and maximum number of terms; must not be {@code null}
     * @throws NullPointerException if {@code featureSetting} is {@code null}
     *         (raised by {@link Validate#notNull})
     */
    public PreprocessingPipeline(FeatureSetting featureSetting) {
        Validate.notNull(featureSetting, "featureSetting must not be null");
        connectToPreviousProcessor(new LowerCaser());

        int minNGramLength = featureSetting.getMinNGramLength();
        int maxNGramLength = featureSetting.getMaxNGramLength();
        FeatureSetting.TextFeatureType textFeatureType = featureSetting.getTextFeatureType();

        if (textFeatureType == FeatureSetting.TextFeatureType.CHAR_NGRAMS) {
            // NOTE(review): the meaning of the boolean flag is defined by CharNGramCreator
            // and is not visible here -- confirm before changing it.
            connectToPreviousProcessor(new CharNGramCreator(minNGramLength, maxNGramLength, true, featureSetting.getMaxTerms()));
        } else {
            // Every non-character feature type is tokenized first, then combined into n-grams.
            connectToPreviousProcessor(new RegExTokenizer());
            connectToPreviousProcessor(new NGramCreator(minNGramLength, maxNGramLength, new String[0]));
        }

        // Length filtering is only applied to word n-grams.
        if (textFeatureType == FeatureSetting.TextFeatureType.WORD_NGRAMS) {
            connectToPreviousProcessor(new LengthTokenRemover(featureSetting.getMinimumTermLength(), featureSetting.getMaximumTermLength()));
        }

        connectToPreviousProcessor(new DuplicateTokenRemover());
        connectToPreviousProcessor(new UnwantedTokenRemover());
    }
}
