package ws.palladian.classification.webpage;

import com.aliasi.xml.XHtmlWriter;
import edu.smu.tspell.wordnet.impl.file.SenseKey;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.StringInputStream;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/classification/webpage/PageTypeClassifier.class */
public class PageTypeClassifier extends RuleBasedPageClassifier<PageType> {
    private static final Logger LOGGER = LoggerFactory.getLogger(PageTypeClassifier.class);

    public PageType classify(Document document) {
        extractFeatures(document);
        LOGGER.info("starting to classify a new document");
        String str = getMetaTags().get("generator");
        if (str != null) {
            String lowerCase = str.toLowerCase();
            if (lowerCase.indexOf("wordpress") > -1 || lowerCase.indexOf("blogger") > -1) {
                return PageType.BLOG;
            }
            if (lowerCase.indexOf("vbulletin") > -1 || lowerCase.indexOf("phpbb") > -1) {
                return PageType.FORUM;
            }
        }
        if (getMetaTags().get("copyright") != null && getMetaTags().get("copyright").toLowerCase().indexOf("phpbb") > -1) {
            return PageType.FORUM;
        }
        for (Node node : XPathHelper.getXhtmlNodes(document, "//LINK")) {
            if (node.getAttributes().getNamedItem(XHtmlWriter.REL) != null && node.getAttributes().getNamedItem("title") != null && node.getAttributes().getNamedItem("title").getTextContent().toLowerCase().indexOf("phpbb") > -1) {
                return PageType.FORUM;
            }
        }
        return getPageTitle().toLowerCase().indexOf("google groups") > -1 ? PageType.FORUM : PageType.GENERIC;
    }

    /* JADX WARN: Can't rename method to resolve collision */
    @Override // ws.palladian.classification.webpage.RuleBasedPageClassifier
    public PageType classify(String str) {
        Document document = null;
        try {
            document = ParserFactory.createHtmlParser().parse(new StringInputStream(str));
            document.setDocumentURI("http://net-clipping.de");
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return classify(document);
    }

    public PageType classify(File file) {
        return classify(new DocumentRetriever().getWebDocument(file.getPath()));
    }

    public PageType classify(URL url) {
        return classify(new DocumentRetriever().getWebDocument(url.toString()));
    }

    public boolean isBlog(File file) {
        return isBlog(new DocumentRetriever().getWebDocument(file.getPath()));
    }

    public boolean isBlog(String str) {
        Document document = null;
        try {
            document = ParserFactory.createHtmlParser().parse(new StringInputStream(str));
            document.setDocumentURI("http://net-clipping.de");
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return isBlog(document);
    }

    public boolean isBlog(Document document) {
        return classify(document).equals(PageType.BLOG);
    }

    public static void main(String[] strArr) throws IOException {
        StopWatch stopWatch = new StopWatch();
        PageTypeClassifier pageTypeClassifier = new PageTypeClassifier();
        HashMap hashMap = new HashMap();
        HashMap hashMap2 = new HashMap();
        hashMap2.put("data/test/pagetype/content/blog", PageType.BLOG);
        hashMap2.put("data/test/pagetype/content/forum", PageType.FORUM);
        hashMap2.put("data/test/pagetype/content/generic", PageType.GENERIC);
        hashMap2.put("data/test/pagetype/overview/blog", PageType.BLOG);
        hashMap2.put("data/test/pagetype/overview/forum", PageType.FORUM);
        hashMap2.put("data/test/pagetype/overview/generic", PageType.GENERIC);
        hashMap2.put("data/test/pagetype/search/blog", PageType.BLOG);
        hashMap2.put("data/test/pagetype/search/forum", PageType.FORUM);
        hashMap2.put("data/test/pagetype/search/generic", PageType.GENERIC);
        hashMap2.put("data/test/pagetype/spam/blog", PageType.BLOG);
        hashMap2.put("data/test/pagetype/spam/forum", PageType.FORUM);
        hashMap2.put("data/test/pagetype/spam/generic", PageType.GENERIC);
        for (Map.Entry entry : hashMap2.entrySet()) {
            for (File file : FileHelper.getFiles((String) entry.getKey())) {
                if (file.getAbsolutePath().indexOf(".svn") <= -1 && !file.isDirectory()) {
                    hashMap.put(file.getAbsolutePath(), entry.getValue());
                }
            }
        }
        int i = 0;
        int i2 = 0;
        int i3 = 0;
        int i4 = 0;
        for (Map.Entry entry2 : hashMap.entrySet()) {
            if (((String) entry2.getKey()).indexOf(".svn") <= -1) {
                boolean isBlog = pageTypeClassifier.isBlog(new File((String) entry2.getKey()));
                if (isBlog && ((PageType) entry2.getValue()).equals(PageType.BLOG)) {
                    i2++;
                } else if (((PageType) entry2.getValue()).equals(PageType.BLOG)) {
                    i3++;
                } else if (isBlog) {
                    i4++;
                } else {
                    i2++;
                }
                PageType classify = pageTypeClassifier.classify(new File((String) entry2.getKey()));
                if (classify.equals(entry2.getValue())) {
                    i++;
                    LOGGER.info("CORRECT (as " + entry2.getValue() + "): " + ((String) entry2.getKey()));
                } else {
                    LOGGER.info("WRONG (as " + classify + ", should be " + entry2.getValue() + "): " + ((String) entry2.getKey()));
                }
            }
        }
        LOGGER.info("correctly classified: " + MathHelper.round((100 * i) / hashMap.size(), 2) + SenseKey.LEMMA_TERMINATOR);
        LOGGER.info("correctly classified just blog: " + MathHelper.round((100 * i2) / hashMap.size(), 2) + SenseKey.LEMMA_TERMINATOR);
        LOGGER.info("false positive blog rate: " + MathHelper.round((100 * i4) / hashMap.size(), 2) + SenseKey.LEMMA_TERMINATOR);
        LOGGER.info("false negative blog rate: " + MathHelper.round((100 * i3) / hashMap.size(), 2) + SenseKey.LEMMA_TERMINATOR);
        LOGGER.info("classification took " + stopWatch.getElapsedTimeString() + " on " + hashMap.size() + " documents");
    }
}
