package ws.palladian.extraction.content;

import com.aliasi.util.Strings;
import edu.stanford.nlp.classify.LinearClassifier;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jdesktop.swingx.JXLabel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.helper.ProgressHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.ConstantFactory;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.collection.LazyMap;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.helper.nlp.LevenshteinSimilarity;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/content/ContentExtractionEvaluation.class */
/**
 * Compares several {@link WebPageContentExtractor}s against an annotated dataset.
 *
 * <p>For every page listed in the dataset's {@code url-mapping.txt}, the reference text is read from
 * {@code <datasetPath>/annotated/<uuid>.html}, each extractor is run on
 * {@code <datasetPath>/original/<uuid>.html}, and the two texts are compared with
 * {@link LevenshteinSimilarity}. The per-page scores and a summary per extractor are written to a
 * tab-separated result file.</p>
 */
public final class ContentExtractionEvaluation {
    private static final Logger LOGGER = LoggerFactory.getLogger(ContentExtractionEvaluation.class);

    /** Column separator used in the result file. */
    private static final String COLUMN_SEPARATOR = "\t";

    /** Single blank used when joining and normalizing text. */
    private static final String SPACE = " ";

    /** Score recorded for an extractor that threw an exception on a page. */
    private static final float FAILED_SCORE = -1.0f;

    /** Matches one line of the index file: {@code <urn:uuid:UUID> URL}. */
    private static final Pattern INDEX_LINE_PATTERN = Pattern.compile("<urn:uuid:([a-z0-9\\-]*?)>\\s(.*?)");

    /** Matches runs of two or more blanks. */
    private static final Pattern MULTIPLE_SPACES = Pattern.compile(" {2,}");

    /** Dataset location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_DATASET_PATH = "/Users/pk/Dropbox/Uni/Datasets/L3S-GN1-20100130203947-00001";

    /** Result file used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_RESULT_FILE = "data/evaluation/ContentExtractionEvaluation_mainContentOnly.tsv";

    private final Mode mode;
    private final String datasetPath;
    private final List<WebPageContentExtractor> extractors;

    /** Selects which annotated parts of a page make up the reference text. */
    public enum Mode {
        /** Text nodes whose nearest {@code x-nc-sel*} ancestor has class {@code x-nc-sel2}. */
        MAIN_CONTENT,
        /** Text nodes whose nearest {@code x-nc-sel*} ancestor has class {@code x-nc-sel2} or {@code x-nc-sel5}. */
        WHOLE_CONTENT
    }

    /**
     * Creates a new evaluation.
     *
     * @param str path to the dataset directory (expected to contain {@code original/}, {@code annotated/}
     *        and {@code url-mapping.txt})
     * @param mode which annotated parts form the reference text
     * @param list the extractors to compare; the list is used as is, not copied
     */
    public ContentExtractionEvaluation(String str, Mode mode, List<WebPageContentExtractor> list) {
        this.datasetPath = str;
        this.mode = mode;
        this.extractors = list;
    }

    /**
     * Runs all extractors on all given pages and writes the results to a tab-separated file.
     *
     * <p>The file is deleted first. It then receives a header line, one line per page with the score of
     * each extractor ({@code ### fail ### } if the extractor threw), and finally one summary line per
     * extractor with its number of wins, number of errors and average score. The average is the sum of
     * the non-failed scores divided by the total number of pages; for an empty map it is {@code NaN} and
     * no header line is written.</p>
     *
     * @param map page UUID to URL, as returned by {@link #readIndexFile()}
     * @param str path of the result file
     */
    // CountMap and LazyMap are project types whose generic signatures are not visible in this file,
    // so they are used raw here and the stored sums are cast explicitly.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public void evaluate(Map<String, String> map, String str) {
        FileHelper.delete(str);

        CountMap wins = CountMap.create();
        CountMap errors = CountMap.create();
        LazyMap scoreSums = LazyMap.create(ConstantFactory.create(Double.valueOf(0.0)));

        boolean headerPending = true;
        int total = map.size();
        int processed = 0;

        for (Map.Entry<String, String> page : map.entrySet()) {
            ProgressHelper.printProgress(processed++, total, 0.0);
            LinkedHashMap<WebPageContentExtractor, Float> scores = evaluate(page.getKey());

            if (headerPending) {
                StringBuilder header = new StringBuilder("UUID\tURL\t");
                for (WebPageContentExtractor extractor : scores.keySet()) {
                    header.append(extractor.getExtractorName()).append(COLUMN_SEPARATOR);
                }
                FileHelper.appendFile(str, header + FileHelper.NEWLINE_CHARACTER);
                headerPending = false;
            }

            StringBuilder row = new StringBuilder();
            row.append(page.getKey()).append(COLUMN_SEPARATOR);
            row.append(page.getValue()).append(COLUMN_SEPARATOR);

            WebPageContentExtractor winner = null;
            float bestScore = FAILED_SCORE;
            for (Map.Entry<WebPageContentExtractor, Float> result : scores.entrySet()) {
                WebPageContentExtractor extractor = result.getKey();
                float score = result.getValue().floatValue();
                boolean failed = score == FAILED_SCORE;

                row.append(failed ? "### fail ### " : Float.toString(score)).append(COLUMN_SEPARATOR);

                // Strictly greater: on a tie the extractor listed first keeps the win,
                // and a failed extractor (score -1) can never win.
                if (score > bestScore) {
                    bestScore = score;
                    winner = extractor;
                }
                if (failed) {
                    errors.add(extractor);
                } else {
                    double previousSum = ((Double) scoreSums.get(extractor)).doubleValue();
                    scoreSums.put(extractor, Double.valueOf(previousSum + score));
                }
            }
            if (winner != null) {
                wins.add(winner);
            }
            FileHelper.appendFile(str, row + FileHelper.NEWLINE_CHARACTER);
        }

        FileHelper.appendFile(str, "------------- stats ------------------\n");
        for (WebPageContentExtractor extractor : this.extractors) {
            double averageScore = ((Double) scoreSums.get(extractor)).doubleValue() / total;
            String summary = SPACE + extractor.getExtractorName()
                    + "\t#wins:" + wins.getCount(extractor)
                    + "\t#errors:" + errors.getCount(extractor)
                    + "\tavg. score:" + averageScore;
            FileHelper.appendFile(str, summary + FileHelper.NEWLINE_CHARACTER);
        }
    }

    /**
     * Scores every extractor on a single page.
     *
     * @param uuid identifier of the page; used to build the file names below the dataset path
     * @return extractor to score, in the order of the extractor list; {@code -1} marks an extractor that
     *         threw an exception
     */
    private LinkedHashMap<WebPageContentExtractor, Float> evaluate(String uuid) {
        LinkedHashMap<WebPageContentExtractor, Float> scores = new LinkedHashMap<>();
        String expectedText = getRealText(uuid);
        String htmlPath = this.datasetPath + "/original/" + uuid + ".html";
        for (WebPageContentExtractor extractor : this.extractors) {
            float score = FAILED_SCORE;
            try {
                String extractedText = extractor.setDocument(htmlPath).getResultText();
                score = (float) getScore(expectedText, extractedText);
            } catch (Exception e) {
                // Best effort by design: one failing extractor must not abort the whole run.
                // The failure is recorded as FAILED_SCORE, but the cause is logged instead of dropped.
                LOGGER.warn("Extractor " + extractor.getExtractorName() + " failed on " + htmlPath, e);
            }
            scores.put(extractor, Float.valueOf(score));
        }
        return scores;
    }

    /**
     * Reads the reference text of a page from its annotated HTML file.
     *
     * @param uuid identifier of the page
     * @return the selected text nodes, each followed by a blank; empty if the file could not be parsed
     */
    private String getRealText(String uuid) {
        StringBuilder text = new StringBuilder();
        String annotatedPath = this.datasetPath + "/annotated/" + uuid + ".html";

        Document document = null;
        try {
            document = ParserFactory.createHtmlParser().parse(new File(annotatedPath));
        } catch (ParserException e) {
            LOGGER.warn("Error parsing " + annotatedPath, e);
        }
        if (document == null) {
            return text.toString();
        }

        String xPath;
        if (this.mode == Mode.MAIN_CONTENT) {
            xPath = "//text()[ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel2']";
        } else {
            xPath = "//text()[ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel2' or ancestor::*[contains(@class,'x-nc-sel')][1]/@class='x-nc-sel5']";
        }
        for (Node textNode : XPathHelper.getXhtmlNodes(document, xPath)) {
            text.append(textNode.getTextContent()).append(SPACE);
        }
        return text.toString();
    }

    /**
     * Computes the similarity between reference and extracted text after normalizing both.
     *
     * @param expected the reference text
     * @param actual the text produced by an extractor
     * @return the value returned by {@link LevenshteinSimilarity#getSimilarity}
     */
    private double getScore(String expected, String actual) {
        return new LevenshteinSimilarity().getSimilarity(normalizeString(expected), normalizeString(actual));
    }

    /**
     * Reads {@code <datasetPath>/url-mapping.txt}.
     *
     * <p>Only lines of the form {@code <urn:uuid:UUID> URL} are used; all other lines are skipped.</p>
     *
     * @return page UUID to URL
     */
    public Map<String, String> readIndexFile() {
        final Map<String, String> uuidToUrl = new HashMap<>();
        FileHelper.performActionOnEveryLine(this.datasetPath + "/url-mapping.txt", new LineAction() {
            @Override
            public void performAction(String line, int lineNumber) {
                Matcher matcher = INDEX_LINE_PATTERN.matcher(line);
                if (matcher.matches()) {
                    uuidToUrl.put(matcher.group(1), matcher.group(2));
                }
            }
        });
        return uuidToUrl;
    }

    /**
     * Replaces protected spaces and line breaks by blanks and collapses runs of blanks to a single one.
     *
     * @param text the text to normalize
     * @return the normalized text
     */
    private static String normalizeString(String text) {
        String singleLine = StringHelper.replaceProtectedSpace(text).replace(FileHelper.NEWLINE_CHARACTER, SPACE);
        return MULTIPLE_SPACES.matcher(singleLine).replaceAll(SPACE);
    }

    /**
     * Compares the Readability and the Palladian extractor on main content.
     *
     * @param strArr optional arguments: {@code [0]} dataset path, {@code [1]} result file; the built-in
     *        defaults are used for missing arguments
     */
    public static void main(String[] strArr) {
        String datasetPath = strArr != null && strArr.length > 0 ? strArr[0] : DEFAULT_DATASET_PATH;
        String resultFile = strArr != null && strArr.length > 1 ? strArr[1] : DEFAULT_RESULT_FILE;

        List<WebPageContentExtractor> extractors = new ArrayList<>();
        extractors.add(new ReadabilityContentExtractor());
        extractors.add(new PalladianContentExtractor());

        ContentExtractionEvaluation evaluation = new ContentExtractionEvaluation(datasetPath, Mode.MAIN_CONTENT, extractors);
        evaluation.evaluate(evaluation.readIndexFile(), resultFile);
    }
}
