package ws.palladian.extraction.location.experimental;

import edu.stanford.nlp.classify.LinearClassifier;
import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.ProcessHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.constants.SizeUnit;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.wikipedia.MultiStreamBZip2InputStream;
import ws.palladian.retrieval.wikipedia.WikipediaPage;
import ws.palladian.retrieval.wikipedia.WikipediaPageCallback;
import ws.palladian.retrieval.wikipedia.WikipediaPageContentHandler;
import ws.palladian.retrieval.wikipedia.WikipediaUtil;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/location/experimental/WikipediaCaseDictionaryCreator.class */
/**
 * Builds a "case dictionary" from Wikipedia page texts: for every token (lowercased) it counts how often
 * the token occurs in total and how often it occurs starting with an uppercase letter, and writes the
 * result as tab-separated lines {@code word<TAB>totalCount<TAB>uppercaseCount}.
 *
 * <p>
 * NOTE: the counts are kept in static maps which are never reset, so repeated calls to
 * {@link #mineCaseDictionary(File, File, int)} or {@link #addCounts(String)} within one JVM accumulate.
 */
class WikipediaCaseDictionaryCreator {
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaCaseDictionaryCreator.class);

    /** Number of occurrences per lowercased token, regardless of the original casing. */
    private static final CountMap<String> wordCounts = CountMap.create();

    /** Number of occurrences per lowercased token where the original token started uppercase. */
    private static final CountMap<String> uppercaseCounts = CountMap.create();

    /** Explicit charset for written files, so that the output does not depend on the platform default. */
    private static final Charset OUTPUT_CHARSET = Charset.forName("UTF-8");

    /** Column separator of the dictionary file; must match the format string in the writer. */
    private static final String COLUMN_SEPARATOR = "\t";

    /** Words kept by {@link #clean(File, File)} consist of ASCII letters and hyphens only. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[A-Za-z\\-]+");

    /** Minimum total count a word needs in order to be kept by {@link #clean(File, File)}. */
    private static final int MIN_OCCURRENCES = 10;

    private static final String DEFAULT_DICTIONARY_FILE = "wikipediaCaseDictionary.csv";
    private static final String DEFAULT_CLEAN_DICTIONARY_FILE = "wikipediaCaseDictionaryClean.csv";

    /** Thrown from the page callback to abort SAX parsing early (page limit reached or memory low). */
    private static final class StopException extends RuntimeException {
        private static final long serialVersionUID = 1;

        private StopException() {
        }
    }

    WikipediaCaseDictionaryCreator() {
    }

    /**
     * Parses the given Wikipedia dump, collects case statistics from its pages and writes them to
     * {@code outputFile}. Pages with a namespace id other than 0 are skipped (in MediaWiki, namespace 0
     * is the main/article namespace). Processing stops after {@code limit} pages have been counted, or
     * earlier when less than 128 MB of free memory are reported; in both cases the statistics collected
     * so far are still written.
     *
     * @param wikipediaDump The dump to read; it is wrapped in a {@link MultiStreamBZip2InputStream}, so
     *            presumably a bzip2-compressed XML dump is expected.
     * @param outputFile The file to which the dictionary is written (UTF-8, tab-separated).
     * @param limit Maximum number of pages to process, must be greater than zero.
     * @throws IllegalArgumentException If {@code wikipediaDump} is not an accessible file or
     *             {@code limit} is not positive.
     * @throws IllegalStateException If reading, parsing or writing fails.
     */
    public static void mineCaseDictionary(File wikipediaDump, File outputFile, final int limit) {
        if (!wikipediaDump.isFile()) {
            throw new IllegalArgumentException(wikipediaDump + " is not a file or could not be accessed.");
        }
        Validate.isTrue(limit > 0, "limit must be greater zero");

        // single-element array so that the anonymous callback can mutate the counter
        final int[] pageCounter = {0};
        WikipediaPageCallback pageCallback = new WikipediaPageCallback() {
            @Override
            public void callback(WikipediaPage page) {
                if (page.getNamespaceId() != 0) {
                    return;
                }
                if (pageCounter[0]++ == limit) {
                    throw new StopException();
                }
                if (ProcessHelper.getFreeMemory() < SizeUnit.MEGABYTES.toBytes(128L)) {
                    LOGGER.warn("Memory nearly exhausted, stopping. Make sure to assign lots of heap memory before running!");
                    throw new StopException();
                }
                LOGGER.debug("Processing page {}", pageCounter[0]);
                String plainText = WikipediaUtil.stripMediaWikiMarkup(page.getText());
                String sentences = WikipediaUtil.extractSentences(StringHelper.normalizeQuotes(plainText));
                addCounts(sentences);
            }
        };

        // Keep a handle on the raw file stream and close it ourselves: parsing is regularly aborted by a
        // StopException, in which case nobody else would release the file handle.
        InputStream fileStream = null;
        try {
            fileStream = new FileInputStream(wikipediaDump);
            InputStream dumpStream = new MultiStreamBZip2InputStream(new BufferedInputStream(fileStream));
            SAXParserFactory.newInstance().newSAXParser().parse(dumpStream, new WikipediaPageContentHandler(pageCallback));
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        } catch (SAXException e) {
            throw new IllegalStateException(e);
        } catch (StopException expected) {
            // intentional early termination, continue with writing what has been collected
        } finally {
            FileHelper.close(fileStream);
        }
        writeCaseDictionary(outputFile);
    }

    /**
     * Adds the tokens of the given text to the static count maps. The text is split into sentences and
     * each sentence into tokens; the first token of every sentence is skipped (presumably because a
     * sentence-initial capital letter says nothing about the word's usual casing).
     *
     * @param text The text to process.
     */
    public static void addCounts(String text) {
        for (String sentence : Tokenizer.getSentences(text, true)) {
            List<String> tokens = Tokenizer.tokenize(sentence);
            // start at index 1: the sentence-initial token is not counted
            for (int idx = 1; idx < tokens.size(); idx++) {
                String token = tokens.get(idx);
                // Locale.ROOT keeps the dictionary keys independent of the JVM's default locale
                String key = token.toLowerCase(Locale.ROOT);
                wordCounts.add(key);
                if (StringHelper.startsUppercase(token)) {
                    uppercaseCounts.add(key);
                }
            }
        }
    }

    /**
     * Writes one line {@code word<TAB>totalCount<TAB>uppercaseCount} per collected word.
     *
     * @param outputFile The file to write, existing content is replaced.
     * @throws IllegalStateException If writing fails.
     */
    private static void writeCaseDictionary(File outputFile) {
        Writer writer = null;
        try {
            writer = openWriter(outputFile);
            for (String word : wordCounts.keySet()) {
                writer.write(String.format("%s\t%s\t%s\n", word, wordCounts.getCount(word), uppercaseCounts.getCount(word)));
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            FileHelper.close(writer);
        }
    }

    /**
     * Filters a dictionary file: only lines whose word (first column) consists solely of ASCII letters and
     * hyphens and whose total count (second column) is at least 10 are copied to the output. Lines which
     * do not have at least two tab-separated columns or whose count is not an integer are skipped with a
     * warning. The number of lines before and after filtering is logged.
     *
     * @param inputFile The dictionary file to filter.
     * @param outputFile The file to which the kept lines are written.
     * @throws IllegalStateException If writing fails.
     */
    public static void clean(File inputFile, File outputFile) {
        // single-element array so that the anonymous line action can mutate the counter
        final int[] keptLines = {0};
        Writer writer = null;
        try {
            writer = openWriter(outputFile);
            final Writer out = writer;
            LineAction filter = new LineAction() {
                @Override
                public void performAction(String line, int unused) {
                    String[] columns = line.split(COLUMN_SEPARATOR);
                    if (columns.length < 2) {
                        LOGGER.warn("Skipping malformed line: {}", line);
                        return;
                    }
                    int totalCount;
                    try {
                        totalCount = Integer.parseInt(columns[1]);
                    } catch (NumberFormatException e) {
                        LOGGER.warn("Skipping line with non-numeric count: {}", line);
                        return;
                    }
                    if (totalCount < MIN_OCCURRENCES || !WORD_PATTERN.matcher(columns[0]).matches()) {
                        return;
                    }
                    try {
                        out.write(line);
                        out.write('\n');
                    } catch (IOException e) {
                        throw new IllegalStateException(e);
                    }
                    keptLines[0]++;
                }
            };
            // arguments are evaluated left to right, i.e. the counter is read after all lines were processed
            LOGGER.info("Reduced from {} to {}", FileHelper.performActionOnEveryLine(inputFile.getPath(), filter), keptLines[0]);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            FileHelper.close(writer);
        }
    }

    /** Opens a buffered writer for the given file using {@link #OUTPUT_CHARSET}. */
    private static Writer openWriter(File file) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), OUTPUT_CHARSET));
    }

    /**
     * Runs {@link #clean(File, File)}.
     *
     * @param args Optionally the input and the output path; without (both) arguments the files
     *            {@code wikipediaCaseDictionary.csv} and {@code wikipediaCaseDictionaryClean.csv} in the
     *            working directory are used.
     */
    public static void main(String[] args) {
        boolean pathsGiven = args != null && args.length >= 2;
        File inputFile = new File(pathsGiven ? args[0] : DEFAULT_DICTIONARY_FILE);
        File outputFile = new File(pathsGiven ? args[1] : DEFAULT_CLEAN_DICTIONARY_FILE);
        clean(inputFile, outputFile);
    }
}
