package ws.palladian.extraction.entity;

import com.aliasi.util.Strings;
import edu.stanford.nlp.classify.LinearClassifier;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/entity/FileFormatParser.class */
public final class FileFormatParser {
    /** Logger of this class; used to report unsupported tagging formats. */
    private static final Logger LOGGER = LoggerFactory.getLogger(FileFormatParser.class);
    /**
     * Context window size in characters. Not referenced by name in the visible code;
     * NOTE(review): getAnnotationsFromXmlText uses the same value (40) for its left/right
     * context windows, so this constant was presumably inlined by the compiler - confirm.
     */
    private static final int WINDOW_SIZE = 40;

    /** Private constructor: the class only offers static methods and is not meant to be instantiated. */
    private FileFormatParser() {
    }

    /**
     * Collects the distinct values found in the second column of a two-column file.
     *
     * <p>Every line is split with {@code str2}, which {@link String#split(String)} interprets as a
     * regular expression. Empty lines and lines that do not split into exactly two parts are
     * ignored.</p>
     *
     * @param str path of the column file to read
     * @param str2 column separator (regular expression)
     * @return the distinct second-column values; empty if no line qualifies
     */
    public static Set<String> getTagsFromColumnFile(String str, final String str2) {
        final Set<String> tags = new HashSet<>();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                if (line.isEmpty()) {
                    return;
                }
                String[] columns = line.split(str2);
                if (columns.length == 2) {
                    tags.add(columns[1]);
                }
            }
        });
        return tags;
    }

    /**
     * Reads the file at the given path and returns its content with the markup removed by
     * {@code HtmlHelper.stripHtmlTags}.
     *
     * @param str path of the XML-tagged file
     * @return the file content without tags
     */
    private static String getTextFromXML(String str) {
        String taggedContent = FileHelper.readFileToString(str);
        return HtmlHelper.stripHtmlTags(taggedContent);
    }

    /**
     * Returns the untagged text of a file that is annotated in the given format.
     *
     * <p>XML files are read and stripped directly. Column files are first converted to XML with a
     * tab separator; the converted file is written next to the input, at the path returned by
     * {@code FileHelper.appendToFileName(str, "_temp")}, and is not deleted afterwards. Any other
     * format yields an empty string.</p>
     *
     * @param str path of the tagged input file
     * @param taggingFormat format of the input file; must not be {@code null}
     * @return the text without tags, or {@code ""} for unsupported formats
     */
    public static String getText(String str, TaggingFormat taggingFormat) {
        if (taggingFormat.equals(TaggingFormat.XML)) {
            return getTextFromXML(str);
        }
        if (taggingFormat.equals(TaggingFormat.COLUMN)) {
            String xmlPath = FileHelper.appendToFileName(str, "_temp");
            columnToXml(str, xmlPath, LinearClassifier.TEXT_SERIALIZATION_DELIMITER);
            // The converted file is XML, so it can be stripped directly.
            return getTextFromXML(xmlPath);
        }
        return "";
    }

    /**
     * Converts a column file (token, separator, tag per line) into XML-style inline tagging.
     *
     * <p>Consecutive tokens that carry the same tag are wrapped in one {@code <TAG>...</TAG>}
     * element; the tag {@code o} (case-insensitive) means "no entity". An empty input line
     * closes an open tag and produces a line break in the output. Lines with fewer than two
     * columns that are not empty are skipped, and everything before the first token line is
     * ignored. A tag that is still open after the last line is closed as well, so the output stays
     * well-formed when the input does not end with an empty line.</p>
     *
     * @param str path of the column file to read
     * @param str2 path of the XML file to write
     * @param str3 column separator (regular expression, see {@link String#split(String)})
     */
    public static void columnToXml(String str, String str2, final String str3) {
        final StringBuilder xml = new StringBuilder();
        // Single-element arrays act as mutable holders that the anonymous class can update.
        final String[] openTag = {"o"};
        final boolean[] atLineStart = {true};
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                boolean isTokenLine = columns.length >= 2;

                // Nothing has been written yet and this is no token line: ignore leading noise.
                if (!isTokenLine && xml.length() == 0) {
                    return;
                }

                if (!isTokenLine) {
                    if (line.length() == 0) {
                        // An empty line ends the current entity and the current output line.
                        if (!openTag[0].equalsIgnoreCase("o") && lineNumber > 1) {
                            xml.append("</").append(openTag[0]).append(">");
                            openTag[0] = "o";
                        }
                        xml.append(FileHelper.NEWLINE_CHARACTER);
                        atLineStart[0] = true;
                    }
                    return;
                }

                String token = columns[0];
                String tag = columns[1];
                boolean tagJustOpened = false;

                if (!openTag[0].equalsIgnoreCase(tag)) {
                    // The tag changed: close the previous entity, then open the new one if any.
                    if (!openTag[0].equalsIgnoreCase("o") && lineNumber > 1) {
                        xml.append("</").append(openTag[0]).append(">");
                    }
                    if (!tag.equalsIgnoreCase("o")) {
                        if (lineNumber > 1 && !atLineStart[0]) {
                            xml.append(Strings.SINGLE_SPACE_STRING);
                        }
                        xml.append("<").append(tag).append(">");
                        tagJustOpened = true;
                    }
                }
                openTag[0] = tag;

                // Separate word-like tokens by a space, but not directly after an opening tag
                // and not at the start of an output line.
                boolean startsWordLike = token.length() > 0
                        && (Character.isLetterOrDigit(token.charAt(0)) || StringHelper.isBracket(token.charAt(0)));
                if (startsWordLike && !tagJustOpened && lineNumber > 1 && !atLineStart[0]) {
                    xml.append(Strings.SINGLE_SPACE_STRING);
                }
                xml.append(token);
                atLineStart[0] = false;
            }
        });

        // Close an entity that runs up to the end of the input (no trailing empty line).
        if (!openTag[0].equalsIgnoreCase("o")) {
            xml.append("</").append(openTag[0]).append(">");
        }
        FileHelper.writeToFile(str2, xml);
    }

    /**
     * Converts a column file into XML-style tagging where every single token gets its own
     * {@code <TAG>token</TAG>} element (tokens are not merged into multi-token entities).
     *
     * <p>Empty input lines become line breaks in the output; other lines with fewer than two
     * columns are skipped.</p>
     *
     * @param str path of the column file to read
     * @param str2 path of the XML file to write
     * @param str3 column separator (regular expression, see {@link String#split(String)})
     */
    public static void columnToXmlTokenBased(String str, String str2, final String str3) {
        final StringBuilder xml = new StringBuilder();
        // Mutable holder: true while nothing has been written on the current output line.
        final boolean[] atLineStart = {true};
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                if (columns.length >= 2) {
                    String token = columns[0];
                    String tag = columns[1];
                    boolean startsWordLike = !token.isEmpty()
                            && (Character.isLetterOrDigit(token.charAt(0)) || StringHelper.isBracket(token.charAt(0)));
                    if (startsWordLike && lineNumber > 1 && !atLineStart[0]) {
                        xml.append(Strings.SINGLE_SPACE_STRING);
                    }
                    xml.append("<").append(tag).append(">");
                    xml.append(token);
                    xml.append("</").append(tag).append(">");
                    atLineStart[0] = false;
                } else if (line.length() == 0) {
                    xml.append(FileHelper.NEWLINE_CHARACTER);
                    atLineStart[0] = true;
                }
            }
        });
        FileHelper.writeToFile(str2, xml);
    }

    /**
     * Converts a column file (token, separator, tag per line) into bracket notation, where an
     * entity is written as {@code [TAG token token ]}.
     *
     * <p>The tag {@code o} (case-insensitive) means "no entity". Lines with fewer than two columns
     * are skipped. Compared to a naive line-by-line conversion this method additionally</p>
     * <ul>
     * <li>tolerates an empty first column instead of failing on {@code charAt(0)},</li>
     * <li>does not emit a closing bracket before any entity has been opened (e.g. when the first
     * lines of the file are skipped), and</li>
     * <li>closes a bracket that is still open after the last line.</li>
     * </ul>
     *
     * @param str path of the column file to read
     * @param str2 path of the bracket file to write
     * @param str3 column separator (regular expression, see {@link String#split(String)})
     */
    public static void columnToBracket(String str, String str2, final String str3) {
        final StringBuilder bracketed = new StringBuilder();
        // Mutable holder for the tag of the previous token line; "" means "no token seen yet".
        final String[] previousTag = {""};
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                if (columns.length < 2) {
                    return;
                }
                String token = columns[0];
                String tag = columns[1];
                boolean bracketJustOpened = false;

                if (!previousTag[0].equalsIgnoreCase(tag)) {
                    boolean entityOpen = !previousTag[0].isEmpty() && !previousTag[0].equalsIgnoreCase("o");
                    if (entityOpen && lineNumber > 1) {
                        bracketed.append(" ]");
                    }
                    if (!tag.equalsIgnoreCase("o")) {
                        if (lineNumber > 1) {
                            bracketed.append(Strings.SINGLE_SPACE_STRING);
                        }
                        bracketed.append("[").append(tag).append(Strings.SINGLE_SPACE_STRING);
                        bracketJustOpened = true;
                    }
                }
                previousTag[0] = tag;

                // Guard against an empty token before looking at its first character.
                if (!token.isEmpty() && Character.isLetterOrDigit(token.charAt(0)) && !bracketJustOpened) {
                    bracketed.append(Strings.SINGLE_SPACE_STRING);
                }
                bracketed.append(token);
            }
        });

        // Close an entity that runs up to the end of the input.
        if (!previousTag[0].isEmpty() && !previousTag[0].equalsIgnoreCase("o")) {
            bracketed.append(" ]");
        }
        FileHelper.writeToFile(str2, bracketed.toString());
    }

    /**
     * Rewrites a column file so that its last column uses BIO tags.
     *
     * <p>For every line with at least two columns, all columns but the last are kept (re-joined
     * with {@code str3}) and the last column is replaced: {@code o} (case-insensitive) becomes
     * {@code O}; any other tag becomes {@code I-TAG} if the previous processed line had the same
     * tag (case-insensitive) and {@code B-TAG} otherwise. Lines with fewer than two columns are
     * dropped and do not reset the previous tag.</p>
     *
     * @param str path of the column file to read
     * @param str2 path of the BIO column file to write
     * @param str3 column separator; used as regular expression for splitting and literally for
     *            joining the output columns
     */
    public static void columnToColumnBio(String str, String str2, final String str3) {
        final StringBuilder output = new StringBuilder();
        // Mutable holder for the tag of the previously processed line.
        final String[] previousTag = {""};
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                if (columns.length < 2) {
                    return;
                }
                int tagIndex = columns.length - 1;

                // Everything in front of the tag column is passed through unchanged.
                StringBuilder leadingColumns = new StringBuilder();
                for (int column = 0; column < tagIndex; column++) {
                    if (column > 0) {
                        leadingColumns.append(str3);
                    }
                    leadingColumns.append(columns[column]);
                }

                String tag = columns[tagIndex];
                String bioTag;
                if (tag.equalsIgnoreCase("o")) {
                    bioTag = "O";
                } else if (previousTag[0].equalsIgnoreCase(tag)) {
                    bioTag = "I-" + tag;
                } else {
                    bioTag = "B-" + tag;
                }
                previousTag[0] = tag;

                output.append(leadingColumns).append(str3).append(bioTag).append(FileHelper.NEWLINE_CHARACTER);
            }
        });
        FileHelper.writeToFile(str2, output);
    }

    /**
     * Rewrites a BIO column file into a plain column file by removing the {@code B-} / {@code I-}
     * prefix from the tag in the second column.
     *
     * <p>Only a prefix at the very beginning of the tag is removed, so tags that contain
     * {@code B-} or {@code I-} further inside (for example {@code B-SCI-FI}) keep the rest of their
     * name intact. Lines with fewer than two columns are dropped; only the first two columns are
     * written.</p>
     *
     * @param str path of the BIO column file to read
     * @param str2 path of the column file to write
     * @param str3 column separator; used as regular expression for splitting and literally for
     *            joining the output columns
     */
    public static void columnBioToColumn(String str, String str2, final String str3) {
        final StringBuilder output = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                if (columns.length < 2) {
                    return;
                }
                String tag = columns[1];
                // Strip the BIO marker only when it is a real prefix of the tag.
                if (tag.startsWith("B-") || tag.startsWith("I-")) {
                    tag = tag.substring(2);
                }
                output.append(columns[0]).append(str3).append(tag).append(FileHelper.NEWLINE_CHARACTER);
            }
        });
        FileHelper.writeToFile(str2, output.toString());
    }

    /**
     * Converts an XML-tagged file into a column file.
     *
     * @param str path of the XML-tagged file to read
     * @param str2 path of the column file to write
     * @param str3 separator written between token and tag
     */
    public static void xmlToColumn(String str, String str2, String str3) {
        String taggedText = FileHelper.readFileToString(str);
        String columnText = xmlToColumnText(taggedText, str3);
        FileHelper.writeToFile(str2, columnText);
    }

    /**
     * Converts XML-tagged text into column format (one "token, separator, tag" line per token).
     *
     * <p>The text is processed line by line. Tokens produced by {@code Tokenizer.tokenize} that
     * start with {@code </} reset the current tag to {@code O}, tokens that start with {@code <}
     * set the current tag to the text between {@code <} and {@code >}, and every other token is
     * written with the current tag. The tag is reset to {@code O} at the start of each input line
     * and an empty line is written after each input line.</p>
     *
     * @param str the XML-tagged text
     * @param str2 separator written between token and tag
     * @return the text in column format
     */
    public static String xmlToColumnText(String str, String str2) {
        StringBuilder columnText = new StringBuilder();
        String[] inputLines = str.split(FileHelper.NEWLINE_CHARACTER);
        for (int lineIndex = 0; lineIndex < inputLines.length; lineIndex++) {
            String currentTag = "O";
            for (String token : Tokenizer.tokenize(inputLines[lineIndex])) {
                if (token.startsWith("</")) {
                    currentTag = "O";
                    continue;
                }
                if (token.startsWith("<")) {
                    currentTag = StringHelper.getSubstringBetween(token, "<", ">");
                    continue;
                }
                columnText.append(token);
                columnText.append(str2);
                columnText.append(currentTag);
                columnText.append(FileHelper.NEWLINE_CHARACTER);
            }
            // Blank line separates the tokens of consecutive input lines.
            columnText.append(FileHelper.NEWLINE_CHARACTER);
        }
        return columnText.toString();
    }

    /**
     * Converts a slash-tagged file into an XML-tagged file.
     *
     * <p>The conversion goes through the column format: the output path is first written as a
     * tab-separated column file and then overwritten with its XML conversion.</p>
     *
     * @param str path of the slash-tagged file to read
     * @param str2 path of the XML file to write (also used for the intermediate column file)
     */
    public static void slashToXml(String str, String str2) {
        final String separator = LinearClassifier.TEXT_SERIALIZATION_DELIMITER;
        slashToColumn(str, str2, separator);
        columnToXml(str2, str2, separator);
    }

    /**
     * Converts a slash-tagged file ({@code token/TAG} followed by whitespace) into a column file.
     *
     * <p>Each match of the pattern yields one output line consisting of the text before the
     * slash, the separator and the tag. Tags consist of upper-case letters, digits and
     * underscores.</p>
     *
     * @param str path of the slash-tagged file to read
     * @param str2 path of the column file to write
     * @param str3 separator written between token and tag
     */
    public static void slashToColumn(String str, String str2, String str3) {
        Pattern slashPattern = Pattern.compile("(.+?)/([A-Z0-9_]{1,100}?)\\s", Pattern.DOTALL);
        Matcher matcher = slashPattern.matcher(FileHelper.readFileToString(str));
        StringBuilder columnText = new StringBuilder();
        while (matcher.find()) {
            String token = matcher.group(1);
            String tag = matcher.group(2);
            columnText.append(token).append(str3).append(tag).append(FileHelper.NEWLINE_CHARACTER);
        }
        FileHelper.writeToFile(str2, columnText);
    }

    /**
     * Converts a column file into a single-line "token, separator, tag" notation, using
     * {@code |} between token and tag.
     *
     * @param str path of the column file to read
     * @param str2 path of the output file to write
     * @param str3 column separator of the input (regular expression)
     */
    public static void columnToSlash(String str, String str2, String str3) {
        final String tokenTagSeparator = "|";
        columnToSlash(str, str2, str3, tokenTagSeparator);
    }

    /**
     * Converts a column file into a single-line notation in which every token is followed by
     * {@code str4}, its tag and one space.
     *
     * <p>Lines with fewer than two columns are skipped; only the first two columns are used.</p>
     *
     * @param str path of the column file to read
     * @param str2 path of the output file to write
     * @param str3 column separator of the input (regular expression)
     * @param str4 text written between token and tag
     */
    public static void columnToSlash(String str, String str2, final String str3, final String str4) {
        final StringBuilder slashed = new StringBuilder();
        FileHelper.performActionOnEveryLine(str, new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                String[] columns = line.split(str3);
                if (columns.length >= 2) {
                    slashed.append(columns[0]);
                    slashed.append(str4);
                    slashed.append(columns[1]);
                    slashed.append(Strings.SINGLE_SPACE_STRING);
                }
            }
        });
        FileHelper.writeToFile(str2, slashed);
    }

    /**
     * Converts a bracket-tagged file into an XML-tagged file.
     *
     * @param str path of the bracket-tagged file to read
     * @param str2 path of the XML file to write
     */
    public static void bracketToXml(String str, String str2) {
        String bracketText = FileHelper.readFileToString(str);
        String xmlText = bracketToXmlText(bracketText);
        FileHelper.writeToFile(str2, xmlText);
    }

    /**
     * Converts bracket-tagged text ({@code [TAG some tokens ]}) into XML-style tagging
     * ({@code <TAG>some tokens</TAG>}).
     *
     * <p>Every bracket expression found in the input is replaced in the result via
     * {@link String#replace(CharSequence, CharSequence)}, i.e. all identical occurrences of that
     * expression are replaced at once.</p>
     *
     * @param str the bracket-tagged text
     * @return the text with bracket expressions replaced by XML-style tags
     */
    public static String bracketToXmlText(String str) {
        Pattern bracketPattern = Pattern.compile("\\[(\\w+)\\s(.+?)(\\s(.+?))*?\\s{1,2}\\]",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        Matcher matcher = bracketPattern.matcher(str);
        String converted = str;
        while (matcher.find()) {
            String bracketExpression = matcher.group(0);
            // The tag is the text between "[" and the first space ...
            String tag = StringHelper.getSubstringBetween(bracketExpression, "[", Strings.SINGLE_SPACE_STRING);
            // ... and the content is what lies between that space and the closing " ]".
            String content = StringHelper.getSubstringBetween(bracketExpression, Strings.SINGLE_SPACE_STRING, " ]").trim();
            String xmlElement = "<" + tag + ">" + content + "</" + tag + ">";
            converted = converted.replace(bracketExpression, xmlElement);
        }
        return converted;
    }

    /**
     * Converts a bracket-tagged file into a column file.
     *
     * <p>The conversion goes through XML: the output path first receives the XML conversion and
     * is then overwritten with the column conversion of that XML.</p>
     *
     * @param str path of the bracket-tagged file to read
     * @param str2 path of the column file to write (also used for the intermediate XML file)
     * @param str3 separator written between token and tag
     */
    public static void bracketToColumn(String str, String str2, String str3) {
        // Step 1: bracket notation -> XML (stored at the output path).
        bracketToXml(str, str2);
        // Step 2: XML -> column format, replacing the intermediate file in place.
        xmlToColumn(str2, str2, str3);
    }

    /**
     * Copies a column file while replacing every match of the separator with the separator
     * written twice.
     *
     * <p>{@code str3} is used both as the regular expression and (doubled) as the replacement
     * string of {@link String#replaceAll(String, String)}, so regex and replacement
     * metacharacters in it are interpreted accordingly.</p>
     *
     * @param str path of the column file to read
     * @param str2 path of the output file to write
     * @param str3 column separator
     */
    public static void columnTrainingToTest(String str, String str2, String str3) {
        String content = FileHelper.readFileToString(str);
        String doubledSeparator = str3 + str3;
        FileHelper.writeToFile(str2, content.replaceAll(str3, doubledSeparator));
    }

    /**
     * Copies a file while replacing every single-space string in its content with {@code str3}.
     *
     * <p>NOTE(review): despite the method name, the replacement is applied to the whole file
     * content, not only to the first column.</p>
     *
     * @param str path of the file to read
     * @param str2 path of the output file to write
     * @param str3 replacement for each space
     */
    public static void removeWhiteSpaceInFirstColumn(String str, String str2, String str3) {
        String content = FileHelper.readFileToString(str);
        String withoutSpaces = content.replace(Strings.SINGLE_SPACE_STRING, str3);
        FileHelper.writeToFile(str2, withoutSpaces);
    }

    /**
     * Copies a file while replacing every tab character with a single-space string.
     *
     * @param str path of the tab-separated file to read
     * @param str2 path of the space-separated file to write
     */
    public static void tsvToSsv(String str, String str2) {
        String tabSeparated = FileHelper.readFileToString(str);
        String spaceSeparated = tabSeparated.replaceAll("\\t", Strings.SINGLE_SPACE_STRING);
        FileHelper.writeToFile(str2, spaceSeparated);
    }

    /**
     * Tokenizes a plain text file and writes it as a column file in which every token carries the
     * placeholder tag {@code X}.
     *
     * @param str path of the text file to read
     * @param str2 path of the column file to write
     * @param str3 separator written between token and tag
     */
    public static void textToColumn(String str, String str2, String str3) {
        String text = FileHelper.readFileToString(str);
        List<String> tokens = Tokenizer.tokenize(text);
        StringBuilder columnText = new StringBuilder();
        for (String token : tokens) {
            columnText.append(token);
            columnText.append(str3);
            columnText.append("X");
            columnText.append(FileHelper.NEWLINE_CHARACTER);
        }
        FileHelper.writeToFile(str2, columnText);
    }

    /**
     * Reads the annotations of a tagged file in the given format.
     *
     * <p>Only the XML and COLUMN formats are supported. For any other format an error is logged
     * and {@code null} is returned.</p>
     *
     * @param str path of the tagged file
     * @param taggingFormat format of the file; must not be {@code null}
     * @return the annotations found in the file, or {@code null} if the format is not supported
     */
    public static Annotations<ContextAnnotation> getAnnotations(String str, TaggingFormat taggingFormat) {
        Annotations<ContextAnnotation> annotations;
        if (taggingFormat.equals(TaggingFormat.XML)) {
            annotations = getAnnotationsFromXmlFile(str);
        } else if (taggingFormat.equals(TaggingFormat.COLUMN)) {
            annotations = getAnnotationsFromColumn(str);
        } else {
            LOGGER.error("format {} not supported for getAnnotations", taggingFormat);
            annotations = null;
        }
        return annotations;
    }

    /**
     * Reads the annotations of a tab-separated column file.
     *
     * <p>The file is first converted to XML; the converted file is written to the path returned
     * by {@code FileHelper.appendToFileName(str, "_t")} and then parsed.</p>
     *
     * @param str path of the column file
     * @return the annotations found in the file
     */
    public static Annotations<ContextAnnotation> getAnnotationsFromColumn(String str) {
        String xmlPath = FileHelper.appendToFileName(str, "_t");
        columnToXml(str, xmlPath, LinearClassifier.TEXT_SERIALIZATION_DELIMITER);
        return getAnnotationsFromXmlFile(xmlPath);
    }

    /**
     * Reads the annotations of a tab-separated column file, treating every token as its own
     * annotation.
     *
     * <p>The file is first converted to token-based XML; the converted file is written to the
     * path returned by {@code FileHelper.appendToFileName(str, "_t")} and then parsed.</p>
     *
     * @param str path of the column file
     * @return the annotations found in the file
     */
    public static Annotations<ContextAnnotation> getAnnotationsFromColumnTokenBased(String str) {
        String xmlPath = FileHelper.appendToFileName(str, "_t");
        columnToXmlTokenBased(str, xmlPath, LinearClassifier.TEXT_SERIALIZATION_DELIMITER);
        return getAnnotationsFromXmlFile(xmlPath);
    }

    /**
     * Extracts all {@code <TAG>content</TAG>} annotations from XML-tagged text.
     *
     * <p>Tag names consist of letters only (matched case-insensitively) and must not carry
     * attributes; the content may span lines and is limited to 1000 characters. For each match a
     * {@link ContextAnnotation} is created from</p>
     * <ul>
     * <li>the match start minus the length of all annotation markup seen before it
     * (presumably the offset in the tag-free text - confirm against {@code ContextAnnotation}),</li>
     * <li>the content with nested tags and line breaks removed,</li>
     * <li>the tag name, and</li>
     * <li>up to 40 characters of tag-stripped, trimmed text to the left and to the right.</li>
     * </ul>
     *
     * @param str the XML-tagged text
     * @return the annotations in order of appearance; empty if none are found
     */
    public static Annotations<ContextAnnotation> getAnnotationsFromXmlText(String str) {
        // Number of characters taken as context on each side of an annotation.
        final int contextWindow = 40;
        Annotations<ContextAnnotation> annotations = new Annotations<>();
        Pattern annotationPattern = Pattern.compile("\\<([A-Z]+)\\>(.{1,1000}?)\\</\\1\\>",
                Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        Matcher matcher = annotationPattern.matcher(str);

        // Total length of the markup (opening/closing/nested tags) of all previous matches.
        int markupBefore = 0;
        while (matcher.find()) {
            int matchStart = matcher.start();
            int matchEnd = matcher.end();

            String leftContext = HtmlHelper.stripHtmlTags(
                    str.substring(Math.max(0, matchStart - contextWindow), matchStart)).trim();
            String rightContext = HtmlHelper.stripHtmlTags(
                    str.substring(matchEnd, Math.min(str.length(), matchEnd + contextWindow))).trim();

            String tag = matcher.group(1);
            String taggedContent = matcher.group(2);
            int nestedMarkupLength = HtmlHelper.countTagLength(taggedContent);
            String value = HtmlHelper.stripHtmlTags(taggedContent).replaceAll(FileHelper.NEWLINE_CHARACTER, "");

            annotations.add(new ContextAnnotation(matchStart - markupBefore, value, tag, leftContext, rightContext));

            // "<TAG>" is tag + 2 characters, "</TAG>" is tag + 3 characters.
            int openingTagLength = tag.length() + 2;
            int closingTagLength = tag.length() + 3;
            markupBefore += openingTagLength + nestedMarkupLength + closingTagLength;
        }
        return annotations;
    }

    /**
     * Reads an XML-tagged file and extracts its annotations.
     *
     * @param str path of the XML-tagged file
     * @return the annotations found in the file
     */
    public static Annotations<ContextAnnotation> getAnnotationsFromXmlFile(String str) {
        String taggedText = FileHelper.readFileToString(str);
        return getAnnotationsFromXmlText(taggedText);
    }

    /**
     * Selects seed annotations from a column file: at most {@code i} annotations per tag, each
     * with a value that has not been selected before.
     *
     * <p>Annotations are considered in file order. An annotation is taken if its tag has not yet
     * reached the limit and its value was not already taken (for any tag).</p>
     *
     * @param str path of the tab-separated column file
     * @param i maximum number of seeds per tag, or {@code -1} for no limit
     * @return the selected seed annotations
     */
    public static Annotations<ContextAnnotation> getSeedAnnotations(String str, int i) {
        Annotations<ContextAnnotation> seeds = new Annotations<>();
        CountMap<String> seedsPerTag = CountMap.create();
        Set<String> selectedValues = new HashSet<>();
        for (ContextAnnotation candidate : getAnnotationsFromColumn(str)) {
            String tag = candidate.getTag();
            boolean belowLimit = i == -1 || seedsPerTag.getCount(tag) < i;
            // Set.add returns false for values that were already selected.
            if (belowLimit && selectedValues.add(candidate.getValue())) {
                seeds.add(candidate);
                seedsPerTag.add(tag);
            }
        }
        return seeds;
    }

    /**
     * Ad-hoc demo entry point: prints the annotations found in a sample string and terminates
     * the JVM. The conversion calls after {@code System.exit(0)} are never executed; they only
     * illustrate how the converters can be chained.
     *
     * @param strArr command line arguments (unused)
     */
    public static void main(String[] strArr) {
        final String tab = LinearClassifier.TEXT_SERIALIZATION_DELIMITER;
        final String sampleText = "asdfasdf <CITY role=\"main\">Dresden</CITY> asdfasdf asdf asdf <C>Berlin</C> asdfk <CITY>Berlin</CITY>";

        CollectionHelper.print(getAnnotationsFromXmlText(sampleText));
        System.exit(0);

        // --- not reached: the JVM has been terminated above ---
        columnToXml("data/temp/columnFormat.tsv", "data/temp/xmlFormat.xml", "\\t");
        xmlToColumn("data/temp/xmlFormat.xml", "data/temp/columnFormat2.tsv", "\\t");
        xmlToColumn("data/temp/allTagged.xml", "data/temp/allTaggedColumn.tsv", "\\t");
        xmlToColumn("data/datasets/ner/mobilephone/text/all.xml", "data/datasets/ner/mobilephone/text/allColumn.tsv", tab);

        columnTrainingToTest("data/temp/allColumn.tsv", "data/temp/allColumnTest.tsv", tab);
        columnToColumnBio("data/temp/allColumn.tsv", "data/temp/allColumnBIO.tsv", tab);
        columnToBracket("data/temp/allColumn.tsv", "data/temp/allBracket.tsv", tab);

        bracketToXml("data/temp/allBracket.tsv", "data/temp/allXMLFromBracket.tsv");
        bracketToColumn("data/temp/allBracket.tsv", "data/temp/allColumnFromBracket.tsv", tab);

        columnToXml("data/temp/allColumn.tsv", "data/temp/allXML.xml", tab);
        xmlToColumn("data/temp/allXML.xml", "data/temp/allColumnFromXML.tsv", tab);

        slashToXml("data/temp/slashedText.txt", "data/temp/xmlFromSlashed.xml");
        slashToColumn("data/temp/slashedText.txt", "data/temp/columnFromSlashed.tsv", tab);

        CollectionHelper.print(getAnnotationsFromXmlFile("data/temp/xmlFromSlashed.xml"));
    }
}
