package ws.palladian.extraction.token;

import com.aliasi.util.Strings;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.helpers.AbsoluteTimeDateFormat;
import org.apache.log4j.spi.LocationInfo;
import ws.palladian.extraction.entity.DateAndTimeTagger;
import ws.palladian.extraction.entity.SmileyTagger;
import ws.palladian.extraction.entity.UrlTagger;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.constants.DateFormat;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.constants.RegExp;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.processing.TextDocument;
import ws.palladian.processing.features.Annotation;
import ws.palladian.processing.features.PositionAnnotation;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/token/Tokenizer.class */
public final class Tokenizer {
    /**
     * Regular expression that matches the end of an English sentence. The leading negative look-behind
     * lists prefixes (abbreviations such as "Mr", "Dr", "etc", dotted initials) after which punctuation
     * is not treated as a sentence end; the trailing negative look-ahead rejects punctuation that is
     * followed by digits, further punctuation, quotes, or a top-level-domain-like suffix.
     */
    public static final String SENTENCE_SPLIT_REGEX_EN = "(?<!(\\.|\\()|([A-Z]\\.[A-Z]){1,10}|St|Mr|mr|Dr|dr|Prof|Mrs|mrs|Jr|jr|vs|ca|etc| sq| ft)((\\.|\\?|\\!)(’|”|\")+(?=\\s+[A-Z])|\\.|\\?+|\\!+)(?!(\\.|[0-9]|\"|”|'|\\)|[!?]|(com|de|fr|uk|au|ca|cn|org|net)/?\\s|\\()|[A-Za-z]{1,15}\\.|[A-Za-z]{1,15}\\(\\))";

    /** Compiled form of {@link #SENTENCE_SPLIT_REGEX_EN}. */
    private static final Pattern SENTENCE_SPLIT_PATTERN_EN = Pattern.compile(SENTENCE_SPLIT_REGEX_EN);

    /**
     * German counterpart of {@link #SENTENCE_SPLIT_REGEX_EN}; its look-behind additionally lists German
     * abbreviations such as "bzw", "ggf" and "z. B".
     */
    public static final String SENTENCE_SPLIT_REGEX_DE = "(?<!(\\.|\\()|([A-Z]\\.[A-Z]){1,10}|St|[mM]r|[dD]r|Prof|[mM]s|[jJ]r|vs|ca|engl|etc|bzw|ggf|z\\.\\s?B|u\\.s\\.w|u\\.a)((\\.|\\?|\\!)(”|\")\\s[A-Z]|\\.|\\?+|\\!+)(?!(\\.|[0-9]|\"|”|'|\\)| B\\.|[!?]|(com|de|fr|uk|au|ca|cn|org|net)/?\\s|\\()|[A-Za-z]{1,15}\\.|[A-Za-z]{1,15}\\(\\))";

    /** Compiled form of {@link #SENTENCE_SPLIT_REGEX_DE}. */
    private static final Pattern SENTENCE_SPLIT_PATTERN_DE = Pattern.compile(SENTENCE_SPLIT_REGEX_DE);

    /**
     * Regular expression describing a single token: dotted upper-case initials, words optionally joined
     * by '-', '.' or ',', words with a leading dot, simple tags, dollar amounts, or a run of other
     * non-word, non-whitespace characters.
     */
    public static final String TOKEN_SPLIT_REGEX = "(?:[A-Z]\\.)+|[\\p{L}\\w]+(?:[-\\.,][\\p{L}\\w]+)*|\\.[\\p{L}\\w]+|</?[\\p{L}\\w]+>|\\$\\d+\\.\\d+|[^\\w\\s<]+";

    /**
     * Compiled form of {@link #TOKEN_SPLIT_REGEX}. The flags were previously written as the bare number
     * 34, which is exactly {@code CASE_INSENSITIVE (2) | DOTALL (32)}.
     */
    public static final Pattern SPLIT_PATTERN = Pattern.compile(TOKEN_SPLIT_REGEX, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    /**
     * Date formats handed to {@link #DATE_TIME_TAGGER}. Judging by the name these are the formats whose
     * textual form can contain dots (and could therefore be mistaken for sentence ends) - TODO confirm
     * against the RegExp definitions.
     */
    private static final DateFormat[] ALL_DATES_WITH_DOTS = {RegExp.DATE_EU_D_MM, RegExp.DATE_EU_D_MM_Y, RegExp.DATE_EU_D_MM_Y_T, RegExp.DATE_EU_D_MMMM, RegExp.DATE_EU_D_MMMM_Y, RegExp.DATE_EU_D_MMMM_Y_T, RegExp.DATE_EU_MM_Y, RegExp.DATE_USA_MMMM_D_Y, RegExp.DATE_USA_MMMM_D_Y_SEP, RegExp.DATE_USA_MMMM_D_Y_T, RegExp.DATE_USA_MMMM_D, RegExp.DATE_EUSA_MMMM_Y, RegExp.DATE_EUSA_YYYY_MMM_D};
    /** Tagger whose annotations are masked before sentence splitting (URLs). */
    private static final UrlTagger URL_TAGGER = new UrlTagger();
    /** Tagger whose annotations are masked before sentence splitting (dates/times in the formats above). */
    private static final DateAndTimeTagger DATE_TIME_TAGGER = new DateAndTimeTagger(ALL_DATES_WITH_DOTS);
    /** Tagger whose annotations are masked before sentence splitting (smileys). */
    private static final SmileyTagger SMILEY_TAGGER = new SmileyTagger();

    /** Private constructor: the class only exposes static members and is not meant to be instantiated. */
    private Tokenizer() {
    }

    /**
     * Splits a text into tokens by collecting every match of {@link #SPLIT_PATTERN}, in order of
     * appearance.
     *
     * @param str the text to tokenize
     * @return the matched tokens; empty when the pattern matches nowhere
     */
    public static List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        for (Matcher tokenMatcher = SPLIT_PATTERN.matcher(str); tokenMatcher.find();) {
            tokens.add(tokenMatcher.group());
        }
        return tokens;
    }

    /**
     * Computes all non-empty, order-preserving selections ("spans") of the given tokens that contain at
     * most {@code num} tokens. A span is any subset of the tokens, listed in their original order; the
     * tokens do not have to be adjacent.
     *
     * <p>The result order is the one of the former bit-mask enumeration: spans are sorted by the
     * position of their last token, and for the same last token by the order of the remaining prefix.
     * For {@code {"a","b","c"}} and {@code num = 2} this yields
     * {@code [a], [b], [a, b], [c], [a, c], [b, c]}.
     *
     * <p>The previous implementation iterated over {@code (int) Math.pow(2, n)} bit masks. That cast
     * saturates for 31 or more tokens (silently ignoring later tokens) and visits every mask even when
     * only short spans are requested. Spans are now built incrementally, so the work is proportional to
     * the number of spans actually returned and any array length is handled.
     *
     * @param strArr the tokens to combine
     * @param num the maximum number of tokens per span; values below 1 are treated as 1 (single-token
     *        spans were always returned)
     * @return all spans in the order described above; empty for an empty token array
     */
    public static Collection<List<String>> getAllSpans(String[] strArr, Integer num) {
        // Same size limit as before: max(num - 1, 0) + 1, computed in long so that it cannot overflow.
        long maxSpanSize = Math.max(num.intValue() - 1, 0) + 1L;
        List<List<String>> spans = new ArrayList<List<String>>();
        for (String token : strArr) {
            // Spans created so far only use earlier tokens; each may be extended by the current token.
            int earlierSpanCount = spans.size();
            List<String> single = new ArrayList<String>(1);
            single.add(token);
            spans.add(single);
            for (int index = 0; index < earlierSpanCount; index++) {
                List<String> earlier = spans.get(index);
                if (earlier.size() < maxSpanSize) {
                    List<String> extended = new ArrayList<String>(earlier.size() + 1);
                    extended.addAll(earlier);
                    extended.add(token);
                    spans.add(extended);
                }
            }
        }
        return spans;
    }

    /**
     * Collects the distinct character n-grams of a string, i.e. every substring of exactly {@code i}
     * consecutive characters.
     *
     * @param str the string to slice
     * @param i the n-gram length; for values of zero or below every window is the empty string
     * @return the set of n-grams; empty when the string is shorter than {@code i}
     */
    public static Set<String> calculateCharNGrams(String str, int i) {
        Set<String> grams = new HashSet<String>();
        int textLength = str.length();
        if (i > textLength) {
            return grams;
        }
        int lastStart = textLength - i;
        for (int start = 0; start <= lastStart; start++) {
            // A non-positive length selects no characters at all, which leaves the empty string.
            grams.add(i > 0 ? str.substring(start, start + i) : "");
        }
        return grams;
    }

    /**
     * Collects the distinct word n-grams of a string. The string is split at every single whitespace
     * character; each n-gram joins {@code i} consecutive parts with one space and is trimmed afterwards.
     * Consecutive whitespace therefore produces empty parts that count as words.
     *
     * <p>The separator is a plain space literal; the earlier reference to a constant of a third-party
     * string utility class only stood in for that literal.
     *
     * @param str the text to split
     * @param i the number of words per n-gram; for values of zero or below every window is empty
     * @return the set of n-grams; empty when the text has fewer than {@code i} parts
     */
    public static Set<String> calculateWordNGrams(String str, int i) {
        Set<String> grams = new HashSet<String>();
        String[] words = str.split("\\s");
        if (words.length < i) {
            return grams;
        }
        for (int start = 0; start <= words.length - i; start++) {
            StringBuilder gram = new StringBuilder();
            for (int offset = 0; offset < i; offset++) {
                if (offset > 0) {
                    gram.append(' ');
                }
                gram.append(words[start + offset]);
            }
            grams.add(gram.toString().trim());
        }
        return grams;
    }

    /**
     * Computes the word n-grams of a string in order of appearance, keeping duplicates. The string is
     * split at whitespace, parts that are empty after trimming are dropped, and each n-gram joins
     * {@code i} consecutive remaining words with one space.
     *
     * <p>The separator is a plain space literal; the earlier reference to a constant of a third-party
     * string utility class only stood in for that literal.
     *
     * @param str the text to split
     * @param i the number of words per n-gram; for values of zero or below every window is empty
     * @return the n-grams from left to right; empty when the text has fewer than {@code i} words
     */
    public static List<String> calculateWordNGramsAsList(String str, int i) {
        List<String> grams = new ArrayList<String>();
        String[] words = filterEmptyWords(str.split("\\s"));
        if (words.length < i) {
            return grams;
        }
        for (int start = 0; start <= words.length - i; start++) {
            StringBuilder gram = new StringBuilder();
            for (int offset = 0; offset < i; offset++) {
                if (offset > 0) {
                    gram.append(' ');
                }
                gram.append(words[start + offset]);
            }
            grams.add(gram.toString().trim());
        }
        return grams;
    }

    /**
     * Returns the entries of the array that still contain characters after trimming, in their original
     * order and unmodified.
     */
    private static String[] filterEmptyWords(String[] strArr) {
        List<String> kept = new ArrayList<String>(strArr.length);
        for (String word : strArr) {
            if (!word.trim().isEmpty()) {
                kept.add(word);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * Collects the character n-grams of a string for every length from {@code i} to {@code i2}
     * (both inclusive) into one set.
     *
     * @param str the string to slice
     * @param i the smallest n-gram length
     * @param i2 the largest n-gram length
     * @return the union of {@link #calculateCharNGrams(String, int)} over all requested lengths
     */
    public static Set<String> calculateAllCharNGrams(String str, int i, int i2) {
        Set<String> allGrams = new HashSet<String>();
        int gramLength = i;
        while (gramLength <= i2) {
            Set<String> gramsOfLength = calculateCharNGrams(str, gramLength);
            allGrams.addAll(gramsOfLength);
            gramLength++;
        }
        return allGrams;
    }

    /**
     * Collects the word n-grams of a string for every size from {@code i} to {@code i2}
     * (both inclusive) into one set.
     *
     * @param str the text to split
     * @param i the smallest number of words per n-gram
     * @param i2 the largest number of words per n-gram
     * @return the union of {@link #calculateWordNGrams(String, int)} over all requested sizes
     */
    public static Set<String> calculateAllWordNGrams(String str, int i, int i2) {
        Set<String> allGrams = new HashSet<String>();
        int wordCount = i;
        while (wordCount <= i2) {
            Set<String> gramsOfSize = calculateWordNGrams(str, wordCount);
            allGrams.addAll(gramsOfSize);
            wordCount++;
        }
        return allGrams;
    }

    /**
     * Concatenates the token n-grams of every size from {@code num} to {@code num2} (both inclusive),
     * smaller sizes first.
     *
     * @param strArr the tokens
     * @param num the smallest n-gram size
     * @param num2 the largest n-gram size
     * @return the results of {@link #calculateNGrams(String[], Integer)} for each size, appended in order
     */
    public static List<List<String>> calculateAllNGrams(String[] strArr, Integer num, Integer num2) {
        List<List<String>> allGrams = new ArrayList<List<String>>();
        int size = num.intValue();
        while (size <= num2.intValue()) {
            List<List<String>> gramsOfSize = calculateNGrams(strArr, Integer.valueOf(size));
            allGrams.addAll(gramsOfSize);
            size++;
        }
        return allGrams;
    }

    /**
     * Slides a window of {@code num} tokens over the array and returns every window as its own list,
     * from left to right.
     *
     * @param strArr the tokens
     * @param num the window size
     * @return one list per window; empty when the array holds fewer than {@code num} tokens
     */
    public static List<List<String>> calculateNGrams(String[] strArr, Integer num) {
        final int windowSize = num.intValue();
        List<List<String>> windows = new ArrayList<List<String>>();
        if (windowSize > strArr.length) {
            return windows;
        }
        int lastStart = strArr.length - windowSize;
        for (int start = 0; start <= lastStart; start++) {
            List<String> window = new ArrayList<String>(windowSize);
            int end = start + windowSize;
            for (int position = start; position < end; position++) {
                window.add(strArr[position]);
            }
            windows.add(window);
        }
        return windows;
    }

    /**
     * Looks up the sentence around a character position using the English sentence pattern; delegates
     * to {@link #getSentence(String, int, Language)} with {@link Language#ENGLISH}.
     *
     * @param str the text to search
     * @param i the character position inside the text
     * @return whatever the delegate returns for English
     */
    public static String getSentence(String str, int i) {
        return getSentence(str, i, Language.ENGLISH);
    }

    /**
     * Returns the sentence that covers the given character position: the text is split into sentences
     * and the last sentence that starts at or before the position is returned.
     *
     * <p>Sentence positions are located by searching forward from the end of the previous sentence.
     * Searching from the start of the text each time (as was done before) finds the first occurrence of
     * the sentence text, so a sentence that is repeated, or whose text also occurs inside an earlier
     * sentence, was placed too early and could be returned for positions it does not cover.
     *
     * @param str the text to search
     * @param i the character position inside the text; for negative values the whole text is returned
     * @param language the language whose sentence pattern is used for splitting
     * @return the sentence starting at or before the position, or an empty string when the first
     *         sentence already starts behind it
     */
    public static String getSentence(String str, int i, Language language) {
        if (i < 0) {
            return str;
        }
        String covering = "";
        int searchFrom = 0;
        for (String sentence : getSentences(str, language)) {
            int sentenceStart = str.indexOf(sentence, searchFrom);
            if (sentenceStart >= 0) {
                searchFrom = sentenceStart + sentence.length();
            } else {
                // Not found behind the previous sentence: fall back to the former whole-text lookup.
                sentenceStart = str.indexOf(sentence);
            }
            if (sentenceStart > i) {
                break;
            }
            covering = sentence;
        }
        return covering;
    }

    /**
     * Splits a text into sentences using the English sentence pattern; delegates to
     * {@link #getSentences(String, boolean, Language)} with {@link Language#ENGLISH}.
     *
     * @param str the text to split
     * @param z passed through unchanged; see {@link #getSentences(String, boolean, Pattern)} for its effect
     * @return the sentences found by the delegate
     */
    public static List<String> getSentences(String str, boolean z) {
        return getSentences(str, z, Language.ENGLISH);
    }

    /**
     * Splits a text into sentences with the pattern belonging to the given language. German selects the
     * German pattern; every other value (including {@code null}) selects the English one.
     *
     * @param str the text to split
     * @param z passed through unchanged; see {@link #getSentences(String, boolean, Pattern)} for its effect
     * @param language the language of the text
     * @return the sentences found by {@link #getSentences(String, boolean, Pattern)}
     */
    public static List<String> getSentences(String str, boolean z, Language language) {
        Pattern splitPattern = language == Language.GERMAN ? SENTENCE_SPLIT_PATTERN_DE : SENTENCE_SPLIT_PATTERN_EN;
        return getSentences(str, z, splitPattern);
    }

    /**
     * Splits a text into sentences at every match of the given pattern.
     *
     * <p>Before splitting, everything tagged as URL, date/time or smiley is replaced by a numbered
     * placeholder ("URL1", "DATE1", ...), so that punctuation inside those elements is not mistaken
     * for a sentence end. After splitting, the placeholders are replaced by the original text again.
     *
     * <p>Restoring now undoes the masking in exact reverse order: last masked type first, and within a
     * type the highest number first. The former restore step iterated hash maps in arbitrary order, so
     * with ten or more elements of a type "URL1" could be replaced inside "URL10", corrupting the
     * output; restoring the types in masking order could additionally leave placeholders behind when a
     * later masked element overlapped an earlier placeholder. The prefixes "DATE" and "?" are written as
     * literals; they used to be spelled through constants of a logging library with the same values.
     *
     * @param str the text to split
     * @param z when {@code true}, each sentence is reduced to its last line and only kept if that line
     *        ends with '.', '?', '!' (optionally followed by a closing quote after a period), is longer
     *        than 8 characters without surrounding quotes and contains more than two words
     * @param pattern the pattern whose match ends mark sentence ends
     * @return the non-empty, trimmed sentences in order of appearance
     */
    public static List<String> getSentences(String str, boolean z, Pattern pattern) {
        final String urlPrefix = "URL";
        final String datePrefix = "DATE";
        final String smileyPrefix = SmileyTagger.SMILEY_TAG_NAME;

        // Mask step by step; every tagger sees the text as masked by its predecessors.
        List<String> urls = new ArrayList<String>();
        String masked = maskWithNumberedPlaceholders(str, URL_TAGGER.getAnnotations(str), urlPrefix, urls);
        List<String> dates = new ArrayList<String>();
        masked = maskWithNumberedPlaceholders(masked, DATE_TIME_TAGGER.getAnnotations(masked), datePrefix, dates);
        List<String> smileys = new ArrayList<String>();
        masked = maskWithNumberedPlaceholders(masked, SMILEY_TAGGER.getAnnotations(masked), smileyPrefix, smileys);

        // Cut the masked text behind every pattern match; the remainder forms the last sentence.
        List<String> sentences = new ArrayList<String>();
        Matcher matcher = pattern.matcher(masked);
        int sentenceStart = 0;
        while (matcher.find()) {
            sentences.add(masked.substring(sentenceStart, matcher.end()).trim());
            sentenceStart = matcher.end();
        }
        if (sentenceStart < masked.length()) {
            sentences.add(masked.substring(sentenceStart).trim());
        }

        if (z) {
            List<String> realSentences = new ArrayList<String>();
            for (String candidate : sentences) {
                String[] lines = candidate.split(FileHelper.NEWLINE_CHARACTER);
                String lastLine = lines[lines.length - 1];
                boolean terminated = lastLine.endsWith(".") || lastLine.endsWith("?") || lastLine.endsWith("!")
                        || lastLine.endsWith(".”") || lastLine.endsWith(".\"");
                if (!terminated) {
                    continue;
                }
                String unquoted = StringHelper.trim(lastLine, "“”\"");
                int wordCount = StringHelper.countWhitespaces(unquoted) + 1;
                if (unquoted.length() > 8 && wordCount > 2) {
                    realSentences.add(lastLine.trim());
                }
            }
            sentences = realSentences;
        }

        List<String> restoredSentences = new ArrayList<String>(sentences.size());
        for (String sentence : sentences) {
            // Undo the masking in reverse order so that nested placeholders are resolved correctly.
            String restored = restoreNumberedPlaceholders(sentence, smileyPrefix, smileys);
            restored = restoreNumberedPlaceholders(restored, datePrefix, dates);
            restored = restoreNumberedPlaceholders(restored, urlPrefix, urls);
            if (!restored.isEmpty()) {
                restoredSentences.add(restored);
            }
        }
        return restoredSentences;
    }

    /**
     * Replaces every occurrence of each annotation value by the prefix followed by a running number
     * (starting at 1) and records the replaced values in encounter order.
     */
    private static String maskWithNumberedPlaceholders(String text, List<Annotation> annotations, String prefix, List<String> maskedValues) {
        String masked = text;
        for (Annotation annotation : annotations) {
            String value = annotation.getValue();
            maskedValues.add(value);
            masked = masked.replace(value, prefix + maskedValues.size());
        }
        return masked;
    }

    /**
     * Replaces the numbered placeholders of one prefix by the recorded values, highest number first so
     * that e.g. "URL1" is never replaced inside "URL10".
     */
    private static String restoreNumberedPlaceholders(String text, String prefix, List<String> maskedValues) {
        String restored = text;
        for (int number = maskedValues.size(); number >= 1; number--) {
            restored = restored.replace(prefix + number, maskedValues.get(number - 1));
        }
        return restored;
    }

    /**
     * Masks annotated values in a text. Each annotation is converted to a position annotation; if its
     * value still occurs in the text, the first occurrence is replaced by the mask and the annotation
     * is appended to the list of masked annotations.
     *
     * @param textDocument the document the annotations belong to
     * @param list the annotations to mask
     * @param str the mask that replaces an annotated value
     * @param list2 receives every annotation that was actually masked
     * @param str2 the text to mask
     * @return the text with the masks applied
     */
    private static String maskAnnotations(TextDocument textDocument, List<Annotation> list, String str, List<PositionAnnotation> list2, String str2) {
        String maskedText = str2;
        for (PositionAnnotation candidate : convert(textDocument, list)) {
            if (!maskedText.contains(candidate.getValue())) {
                continue;
            }
            maskedText = StringUtils.replaceOnce(maskedText, candidate.getValue(), str);
            list2.add(candidate);
        }
        return maskedText;
    }

    /**
     * Splits the content of a document into positioned sentences using the English sentence pattern;
     * delegates to {@link #getSentences(TextDocument, String, Language)} with {@link Language#ENGLISH}.
     *
     * @param textDocument the document whose content is split
     * @param str passed through unchanged to the delegate
     * @return the sentences found by the delegate
     */
    public static List<PositionAnnotation> getSentences(TextDocument textDocument, String str) {
        return getSentences(textDocument, str, Language.ENGLISH);
    }

    /**
     * Splits the content of a document into positioned sentences with the pattern belonging to the
     * given language. German selects the German pattern; every other value (including {@code null})
     * selects the English one.
     *
     * @param textDocument the document whose content is split
     * @param str passed through unchanged to {@link #getSentences(TextDocument, Pattern, String)}
     * @param language the language of the document
     * @return the sentences found by {@link #getSentences(TextDocument, Pattern, String)}
     */
    public static List<PositionAnnotation> getSentences(TextDocument textDocument, String str, Language language) {
        Pattern splitPattern = language == Language.GERMAN ? SENTENCE_SPLIT_PATTERN_DE : SENTENCE_SPLIT_PATTERN_EN;
        return getSentences(textDocument, splitPattern, str);
    }

    /**
     * Splits the content of a document into sentences and reports each sentence together with its
     * start position in the original content.
     *
     * <p>URLs, dates/times and smileys found in the content are first replaced by the fixed mask
     * "PALLADIANMASK". The masked text is cut behind every pattern match, each piece is stripped of
     * surrounding whitespace, and finally the positions are translated back to the unmasked content.
     *
     * @param textDocument the document whose content is split
     * @param pattern the pattern whose match ends mark sentence ends
     * @param str only handed on to the position recalculation; not read here
     * @return the sentences as position annotations on the original content
     */
    public static List<PositionAnnotation> getSentences(TextDocument textDocument, Pattern pattern, String str) {
        final String mask = "PALLADIANMASK";
        String content = textDocument.getContent();
        List<PositionAnnotation> maskedAnnotations = new ArrayList<PositionAnnotation>();

        // The taggers are queried in this order, masking is then applied URLs first, smileys last.
        List<Annotation> smileyAnnotations = SMILEY_TAGGER.getAnnotations(content);
        List<Annotation> dateAnnotations = DATE_TIME_TAGGER.getAnnotations(content);
        List<Annotation> urlAnnotations = URL_TAGGER.getAnnotations(content);
        String maskedText = maskAnnotations(textDocument, urlAnnotations, mask, maskedAnnotations, textDocument.getContent());
        maskedText = maskAnnotations(textDocument, dateAnnotations, mask, maskedAnnotations, maskedText);
        maskedText = maskAnnotations(textDocument, smileyAnnotations, mask, maskedAnnotations, maskedText);

        List<PositionAnnotation> maskedSentences = new ArrayList<PositionAnnotation>();
        Matcher matcher = pattern.matcher(maskedText);
        int cursor = 0;
        while (matcher.find()) {
            int matchEnd = matcher.end();
            String piece = maskedText.substring(cursor, matchEnd);
            String withoutLeading = StringHelper.ltrim(piece);
            // The sentence starts behind the whitespace that was stripped on the left.
            int skipped = piece.length() - withoutLeading.length();
            maskedSentences.add(new PositionAnnotation(StringHelper.rtrim(withoutLeading), cursor + skipped));
            cursor = matchEnd;
        }

        // Text behind the last match forms a final sentence, unless it is only whitespace.
        if (cursor < maskedText.length()) {
            String rest = maskedText.substring(cursor);
            String withoutLeading = StringHelper.ltrim(rest);
            int skipped = rest.length() - withoutLeading.length();
            String lastSentence = StringHelper.rtrim(withoutLeading);
            if (!lastSentence.isEmpty()) {
                maskedSentences.add(new PositionAnnotation(lastSentence, cursor + skipped));
            }
        }

        Collections.sort(maskedAnnotations, new Comparator<PositionAnnotation>() {
            @Override
            public int compare(PositionAnnotation first, PositionAnnotation second) {
                int firstStart = first.getStartPosition();
                int secondStart = second.getStartPosition();
                if (firstStart == secondStart) {
                    return 0;
                }
                return firstStart < secondStart ? -1 : 1;
            }
        });
        return recalculatePositions(textDocument, maskedText, maskedAnnotations, maskedSentences, str);
    }

    /**
     * Translates sentences found in the masked text back to the original document content. For every
     * masked sentence the start is shifted by the same distance it had from the end of the previous
     * masked sentence, and the end is corrected by the length difference between each mask occurring in
     * the sentence and the annotation value it replaced. Masks are matched to the annotations by order.
     *
     * @param textDocument the document providing the original content
     * @param str the masked text; not read here
     * @param list the masked annotations in the order their masks occur
     * @param list2 the sentences with positions in the masked text
     * @param str2 not read here
     * @return the sentences cut from the original content with their original start positions
     */
    private static List<PositionAnnotation> recalculatePositions(TextDocument textDocument, String str, List<PositionAnnotation> list, List<PositionAnnotation> list2, String str2) {
        final String mask = "PALLADIANMASK";
        final int maskLength = mask.length();
        List<PositionAnnotation> originalSentences = new ArrayList<PositionAnnotation>();
        int previousOriginalEnd = 0;
        int previousMaskedEnd = 0;
        int nextAnnotation = 0;
        for (PositionAnnotation maskedSentence : list2) {
            int gapToPrevious = maskedSentence.getStartPosition() - previousMaskedEnd;
            int originalStart = previousOriginalEnd + gapToPrevious;
            int shift = originalStart - maskedSentence.getStartPosition();
            int originalEnd = maskedSentence.getEndPosition() + shift;

            // Every mask inside the sentence stands for one annotation value of different length.
            String maskedValue = maskedSentence.getValue();
            int maskAt = maskedValue.indexOf(mask);
            while (maskAt >= 0) {
                originalEnd += list.get(nextAnnotation).getValue().length() - maskLength;
                nextAnnotation++;
                maskAt = maskedValue.indexOf(mask, maskAt + maskLength);
            }

            String originalValue = String.valueOf(textDocument.getContent().subSequence(originalStart, originalEnd));
            originalSentences.add(new PositionAnnotation(originalValue, originalStart));
            previousOriginalEnd = originalEnd;
            previousMaskedEnd = maskedSentence.getEndPosition();
        }
        return originalSentences;
    }

    /**
     * Creates a position annotation with the same value and start position for every given annotation.
     *
     * @param textDocument not read here
     * @param list the annotations to convert
     * @return the converted annotations in the same order
     */
    private static List<PositionAnnotation> convert(TextDocument textDocument, List<Annotation> list) {
        List<PositionAnnotation> converted = new ArrayList<PositionAnnotation>();
        for (Annotation source : list) {
            PositionAnnotation copy = new PositionAnnotation(source.getValue(), source.getStartPosition());
            converted.add(copy);
        }
        return converted;
    }

    /**
     * Splits a text into sentences using the English sentence pattern; delegates to
     * {@link #getSentences(String, Language)} with {@link Language#ENGLISH}.
     *
     * @param str the text to split
     * @return the sentences found by the delegate
     */
    public static List<String> getSentences(String str) {
        return getSentences(str, Language.ENGLISH);
    }

    /**
     * Splits a text into sentences for the given language without the additional sentence filter;
     * delegates to {@link #getSentences(String, boolean, Language)} with {@code false}.
     *
     * @param str the text to split
     * @param language the language of the text
     * @return the sentences found by the delegate
     */
    public static List<String> getSentences(String str, Language language) {
        return getSentences(str, false, language);
    }

    /**
     * Returns the trailing part of a text that belongs to its last, still unfinished sentence, i.e.
     * everything behind the last sentence boundary.
     *
     * <p>The boundary is searched backwards: starting at the last period or line break, candidates are
     * tested with {@link #isSentenceStartAfter(String, int)} and, if rejected, the previous period is
     * tried. A later '!', '?' or ':' always takes precedence. A candidate that is the very last
     * character of the text is accepted without test, so a text ending in a period yields an empty
     * phrase. Without any boundary the whole (whitespace-normalized) text is returned.
     *
     * <p>Compared to the earlier version this drops a statement without effect, looks each delimiter up
     * once, and spells "?" and the single space as literals instead of constants borrowed from
     * unrelated libraries; the result is unchanged.
     *
     * @param str the text ending inside a sentence
     * @return the text behind the last sentence boundary, without one leading space
     */
    public static String getPhraseFromBeginningOfSentence(String str) {
        String text = StringHelper.removeDoubleWhitespaces(str);
        int lastIndex = text.length() - 1;

        int boundary = Math.max(text.lastIndexOf("."), text.lastIndexOf(FileHelper.NEWLINE_CHARACTER));
        while (boundary > -1 && boundary < lastIndex && !isSentenceStartAfter(text, boundary)) {
            // Rejected: continue with the closest period in front of the candidate.
            boundary = text.lastIndexOf(".", boundary - 1);
        }

        // Any later '!', '?' or ':' ends the previous sentence as well.
        boundary = Math.max(boundary, text.lastIndexOf("!"));
        boundary = Math.max(boundary, text.lastIndexOf("?"));
        boundary = Math.max(boundary, text.lastIndexOf(":"));

        String phrase = text.substring(boundary + 1);
        if (phrase.startsWith(" ")) {
            phrase = phrase.substring(1);
        }
        return phrase;
    }

    /**
     * Decides whether the character at {@code index} (which must not be the last character) separates
     * two sentences: it is directly followed by an upper-case letter and not preceded by a number, or it
     * is followed by a space and then an upper-case letter, '-' or '=', or it is or precedes a line
     * break.
     */
    private static boolean isSentenceStartAfter(String text, int index) {
        char next = text.charAt(index + 1);
        boolean boundary = false;
        if (index > 0) {
            boundary = !StringHelper.isNumber(text.charAt(index - 1)) && Character.isUpperCase(next);
        }
        if (!boundary && index < text.length() - 2) {
            char afterNext = text.charAt(index + 2);
            boundary = (Character.isUpperCase(afterNext) || afterNext == '-' || afterNext == '=') && next == ' ';
        }
        if (!boundary && (next == '\n' || text.charAt(index) == '\n')) {
            boundary = true;
        }
        return boundary;
    }

    /**
     * Returns the leading part of a text up to and including the end of its first sentence.
     *
     * <p>Periods are tested from left to right with {@link #isSentenceEndingPeriod(String, int)}; the
     * first accepted one ends the sentence. A '!' or '?' that occurs earlier (or when no period was
     * accepted) ends it instead. Without any sentence end the whole text is returned.
     *
     * <p>Compared to the earlier version this drops an assignment whose value was always overwritten or
     * overridden, and spells "?" as a literal instead of a constant borrowed from a logging library; the
     * result is unchanged.
     *
     * @param str the text starting inside a sentence
     * @return the text up to and including the first sentence end
     */
    public static String getPhraseToEndOfSentence(String str) {
        int end = str.indexOf(".");
        while (end > -1 && !isSentenceEndingPeriod(str, end)) {
            end = str.indexOf(".", end + 1);
        }

        int exclamation = str.indexOf("!");
        if (exclamation > -1 && (exclamation < end || end == -1)) {
            end = exclamation;
        }
        int question = str.indexOf("?");
        if (question > -1 && (question < end || end == -1)) {
            end = question;
        }
        return str.substring(0, end == -1 ? str.length() : end + 1);
    }

    /**
     * Decides whether the period at {@code index} ends a sentence: it is the last character, or it is
     * directly followed by an upper-case letter or a bracket, or it directly follows a double quote, or
     * it is followed by a space and then an upper-case letter or bracket, or it is followed by a line
     * break.
     */
    private static boolean isSentenceEndingPeriod(String str, int index) {
        int lastIndex = str.length() - 1;
        if (index >= lastIndex) {
            return true;
        }
        char next = str.charAt(index + 1);
        if ((!StringHelper.isNumber(next) && Character.isUpperCase(next)) || StringHelper.isBracket(next)
                || (index > 0 && str.charAt(index - 1) == '\"')) {
            return true;
        }
        if (index < lastIndex - 1) {
            char afterNext = str.charAt(index + 2);
            boolean startsNewSentence = !StringHelper.isNumber(afterNext)
                    && (Character.isUpperCase(afterNext) || StringHelper.isBracket(afterNext));
            if (startsNewSentence && next == ' ') {
                return true;
            }
        }
        return next == '\n';
    }

    /**
     * Small timing run: splits a fixed German sample text into sentences 1000 times, prints the elapsed
     * time and terminates the JVM.
     *
     * <p>The statements that used to follow {@code System.exit(0)} (reading a test file, counting
     * tokens, appending to "sentences.txt") could never execute and have been removed; the observable
     * behavior of this method is unchanged.
     *
     * @param strArr command line arguments; not used
     * @throws IOException kept in the signature for compatibility; nothing in the body throws it
     */
    public static void main(String[] strArr) throws IOException {
        String sample = "Zum Einen ist das Ding ein bisschen groß und es sieht sehr merkwürdig aus, wenn man damit durch die Stadt läuft und es am Ohr hat und zum Anderen ein bisschen unhandlich.\nNun möchte ich noch etwas über die Akkulaufzeit sagen.";
        StopWatch stopWatch = new StopWatch();
        for (int run = 0; run < 1000; run++) {
            getSentences(sample);
        }
        System.out.println(stopWatch.getElapsedTimeString());
        System.exit(0);
    }
}
