package ws.palladian.classification.language.evaluation;

import com.aliasi.util.Strings;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.h2.message.Trace;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Node;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.retrieval.DocumentRetriever;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/classification/language/evaluation/JRCCorpusConverter.class */
/**
 * Helper for preparing a language-organised XML corpus (going by the class name, the JRC corpus) for
 * language classification experiments.
 *
 * <p>The class can
 * <ul>
 * <li>convert a {@code language/year/file.xml} directory tree into one plain text file per document
 * ({@link #convertAllFiles(String, String)}),</li>
 * <li>write index files whose lines have the form {@code <language>/<fileName> <language>}
 * ({@link #createIndex(String, String[])}),</li>
 * <li>cut an index down to a maximum number of lines per label
 * ({@link #createIndexExcerpt(String, int)}),</li>
 * <li>split an index into two parts per label ({@link #splitIndex(String, int)}), and</li>
 * <li>remove empty files from a converted data set ({@link #cleanDataset(String)}).</li>
 * </ul>
 *
 * <p>NOTE(review): the separator, space and "index" constants are referenced through
 * {@code CoreLabel.TAG_SEPARATOR}, {@code Strings.SINGLE_SPACE_STRING} and {@code Trace.INDEX}; this
 * looks like a decompiler substitution for plain string literals - confirm before replacing them.
 */
public class JRCCorpusConverter {
    private static final Logger LOGGER = LoggerFactory.getLogger(JRCCorpusConverter.class);

    /**
     * Converts every XML file found below the source directory into a text file below the target
     * directory.
     *
     * <p>The source directory is walked three levels deep (first level: language folders, second
     * level: year folders, third level: XML files). For each language the converted files are
     * numbered consecutively across all years, starting at {@code 1}, and written to
     * {@code <target>/<language>/<number>.txt}.
     *
     * @param str path of the source directory containing one folder per language
     * @param str2 path of the target directory; it is created if necessary
     */
    public void convertAllFiles(String str, String str2) {
        StopWatch stopWatch = new StopWatch();
        String targetRoot = withTrailingSeparator(str2);
        new File(targetRoot).mkdirs();
        for (File languageDir : FileHelper.getFiles(str)) {
            String language = FileHelper.getFolderName(languageDir.getPath());
            LOGGER.info("converting xml files from language: " + language);
            String languageTarget = targetRoot + language + CoreLabel.TAG_SEPARATOR;
            // The counter is per language and deliberately not reset between years.
            int fileNumber = 1;
            for (File yearDir : FileHelper.getFiles(languageDir.getPath())) {
                LOGGER.info("converting xml files from language: " + language + " and year " + FileHelper.getFolderName(yearDir.getPath()));
                for (File xmlFile : FileHelper.getFiles(yearDir.getPath())) {
                    convertAndSave(xmlFile, languageTarget, fileNumber + ".txt");
                    fileNumber++;
                }
            }
        }
        LOGGER.info("converted all files in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Extracts the text of all {@code //text/body//div[@type='body']/p} nodes of one XML file and
     * writes it, one node per line, to the given target file.
     *
     * @param file the XML file to convert
     * @param str target directory; it is created if necessary
     * @param str2 name of the text file to write inside the target directory
     */
    private void convertAndSave(File file, String str, String str2) {
        new File(str).mkdirs();
        List<Node> paragraphs = XPathHelper.getNodes(new DocumentRetriever().getXmlDocument(file.getPath()), "//text/body//div[@type='body']/p");
        StringBuilder text = new StringBuilder();
        for (Node paragraph : paragraphs) {
            text.append(paragraph.getTextContent()).append(FileHelper.NEWLINE_CHARACTER);
        }
        // Callers pass a directory that already ends with the separator; only add one if missing
        // so the resulting path does not contain a doubled separator.
        FileHelper.writeToFile(withTrailingSeparator(str) + str2, text);
    }

    /**
     * Creates an index over all language folders of the given directory.
     *
     * @param str path of the directory containing one folder per language
     * @throws IOException if the index file cannot be written
     * @see #createIndex(String, String[])
     */
    public void createIndex(String str) throws IOException {
        createIndex(str, null);
    }

    /**
     * Creates an index file inside the given directory. For every file in every accepted language
     * folder one line {@code <language>/<fileName> <language>} is written.
     *
     * @param str path of the directory containing one folder per language; the index file is
     *            written into this directory as well
     * @param strArr names of the language folders to include, or {@code null} to include all
     *            folders; folders that are not included are logged and skipped
     * @throws IOException if the index file cannot be written
     */
    public void createIndex(String str, String[] strArr) throws IOException {
        StopWatch stopWatch = new StopWatch();
        String corpusRoot = withTrailingSeparator(str);
        String indexName = strArr == null ? Trace.INDEX + "All22Languages" : Trace.INDEX + "_" + Arrays.toString(strArr);
        // Build the lookup view once instead of once per language folder.
        List<String> acceptedLanguages = strArr == null ? null : Arrays.asList(strArr);
        FileWriter fileWriter = new FileWriter(corpusRoot + indexName + ".txt");
        try {
            for (File languageDir : FileHelper.getFiles(corpusRoot)) {
                String language = FileHelper.getFolderName(languageDir.getPath());
                if (acceptedLanguages != null && !acceptedLanguages.contains(language)) {
                    LOGGER.info("skip language " + language);
                    continue;
                }
                for (File document : FileHelper.getFiles(languageDir.getPath())) {
                    fileWriter.write(language + CoreLabel.TAG_SEPARATOR + document.getName() + Strings.SINGLE_SPACE_STRING + language);
                    fileWriter.write(FileHelper.NEWLINE_CHARACTER);
                    fileWriter.flush();
                }
            }
        } finally {
            // Release the file handle even if listing or writing fails half way through.
            fileWriter.close();
        }
        LOGGER.info("index file created in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Writes an excerpt of an index file that contains at most {@code i} lines per label. The label
     * is the second space-separated token of a line; lines with fewer than two tokens are ignored.
     * The name of the excerpt file is derived from the index file name via
     * {@code FileHelper.appendToFileName(str, "_ipc" + i)}.
     *
     * @param str path of the index file to read
     * @param i maximum number of lines to keep per label
     * @throws IOException if the excerpt file cannot be opened or closed
     */
    public void createIndexExcerpt(String str, final int i) throws IOException {
        StopWatch stopWatch = new StopWatch();
        final FileWriter fileWriter = new FileWriter(FileHelper.appendToFileName(str, "_ipc" + i));
        try {
            // Raw type kept on purpose: CountMap's generic signature is not visible from this file.
            final CountMap writtenPerLabel = CountMap.create();
            FileHelper.performActionOnEveryLine(str, new LineAction() {
                @Override
                public void performAction(String str2, int i2) {
                    String[] split = str2.split(Strings.SINGLE_SPACE_STRING);
                    if (split.length < 2 || writtenPerLabel.getCount(split[1]) >= i) {
                        return;
                    }
                    try {
                        fileWriter.write(str2 + FileHelper.NEWLINE_CHARACTER);
                        // Count only lines that actually made it into the excerpt.
                        writtenPerLabel.add(split[1]);
                    } catch (IOException e) {
                        // performAction cannot throw a checked exception; keep the stack trace.
                        JRCCorpusConverter.LOGGER.error(e.getMessage(), e);
                    }
                }
            });
        } finally {
            fileWriter.close();
        }
        LOGGER.info("index excerpt file created in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Splits an index file into two index files. For every label (second space-separated token of
     * a line) {@code i} percent of its distinct entries, rounded down, go to the first file and the
     * remaining entries go to the second file. The output names are derived from the index file name
     * via {@code FileHelper.appendToFileName} with the suffixes {@code "_split1"} and
     * {@code "_split2"}. Lines with fewer than two tokens are skipped.
     *
     * @param str path of the index file to split
     * @param i percentage of entries per label that is written to the first split file
     * @throws IOException if one of the split files cannot be written
     */
    public void splitIndex(String str, int i) throws IOException {
        StopWatch stopWatch = new StopWatch();

        // Read and group the input first so that no output file is opened if reading fails.
        Map<String, Set<String>> entriesByLabel = new HashMap<String, Set<String>>();
        for (String line : FileHelper.readFileToArray(str)) {
            String[] split = line.split(Strings.SINGLE_SPACE_STRING);
            if (split.length < 2) {
                // e.g. a blank trailing line; indexing split[1] would throw otherwise
                LOGGER.warn("skipping malformed index line: " + line);
                continue;
            }
            Set<String> entries = entriesByLabel.get(split[1]);
            if (entries == null) {
                entries = new HashSet<String>();
                entriesByLabel.put(split[1], entries);
            }
            entries.add(split[0]);
        }

        FileWriter firstWriter = new FileWriter(FileHelper.appendToFileName(str, "_split1"));
        try {
            FileWriter secondWriter = new FileWriter(FileHelper.appendToFileName(str, "_split2"));
            try {
                for (Map.Entry<String, Set<String>> labelEntries : entriesByLabel.entrySet()) {
                    String label = labelEntries.getKey();
                    Set<String> entries = labelEntries.getValue();
                    int firstShare = (int) ((entries.size() * i) / 100.0d);
                    int writtenToFirst = 0;
                    for (String entry : entries) {
                        if (writtenToFirst < firstShare) {
                            writeIndexLine(firstWriter, entry, label);
                            writtenToFirst++;
                        } else {
                            writeIndexLine(secondWriter, entry, label);
                        }
                    }
                }
            } finally {
                secondWriter.close();
            }
        } finally {
            firstWriter.close();
        }
        LOGGER.info("file " + str + " splitted in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Deletes all empty files (length zero) found one level below the given directory, i.e. inside
     * each of its entries, and logs how many files were actually removed.
     *
     * @param str path of the data set directory
     */
    public void cleanDataset(String str) {
        StopWatch stopWatch = new StopWatch();
        LOGGER.info("cleaning the dataset...");
        int deleted = 0;
        for (File folder : FileHelper.getFiles(str)) {
            for (File candidate : FileHelper.getFiles(folder.getPath())) {
                if (candidate.length() != 0) {
                    continue;
                }
                // File.delete() reports failure through its return value, so only count successes.
                if (candidate.delete()) {
                    deleted++;
                } else {
                    LOGGER.warn("could not delete empty file " + candidate.getPath());
                }
            }
        }
        LOGGER.info("dataset cleansed (" + deleted + " files deleted) in " + stopWatch.getElapsedTimeString());
    }

    /**
     * Creates an index excerpt. Without arguments the previously hard-coded index file and a limit
     * of 1000 lines per label are used.
     *
     * @param strArr optional arguments: {@code [0]} path of the index file, {@code [1]} maximum
     *            number of lines per label
     * @throws IOException if the excerpt file cannot be written
     */
    public static void main(String[] strArr) throws IOException {
        String indexPath = strArr.length > 0 ? strArr[0] : "C:\\Safe\\Datasets\\jrc language data converted\\indexAll22Languages.txt";
        int linesPerLabel = strArr.length > 1 ? Integer.parseInt(strArr[1]) : 1000;
        new JRCCorpusConverter().createIndexExcerpt(indexPath, linesPerLabel);
    }

    /** Returns the path unchanged if it already ends with the separator, otherwise appends it. */
    private static String withTrailingSeparator(String path) {
        return path.endsWith(CoreLabel.TAG_SEPARATOR) ? path : path + CoreLabel.TAG_SEPARATOR;
    }

    /** Writes one {@code <entry> <label>} line to the given writer and flushes it. */
    private static void writeIndexLine(FileWriter writer, String entry, String label) throws IOException {
        writer.write(entry);
        writer.write(Strings.SINGLE_SPACE_STRING);
        writer.write(label);
        writer.write(FileHelper.NEWLINE_CHARACTER);
        writer.flush();
    }
}
