package ws.palladian.classification.webpage;

import com.aliasi.xml.XHtmlWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.extraction.ListDiscoverer;
import ws.palladian.extraction.content.PalladianContentExtractor;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.PageAnalyzer;
import ws.palladian.retrieval.resources.WebImage;
import ws.palladian.retrieval.resources.WebLink;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/classification/webpage/RuleBasedPageClassifier.class */
/**
 * Base class for page classifiers that work on a fixed set of features extracted from a web page.
 *
 * <p>{@link #extractFeatures(Document)} fills the feature fields (title, URL, meta tags, headlines,
 * links, images, sentences, pagination links); subclasses implement {@link #classify(String)} and
 * read the features through the getters.</p>
 *
 * <p>Instances keep the features of the most recently processed page in mutable fields, so one
 * instance holds the state of one page at a time.</p>
 *
 * @param <T> the type of the classification result returned by {@link #classify(String)}
 */
public abstract class RuleBasedPageClassifier<T> {
    private static final Logger LOGGER = LoggerFactory.getLogger(RuleBasedPageClassifier.class);

    /** Headline element names that are collected, in the order they are queried. */
    private static final String[] HEADLINE_TAGS = {"H1", "H2", "H3", "H4", "H5", "H6"};

    /** Allowed deviation (in characters) between the expected and the actual start of the next sentence. */
    private static final double SENTENCE_POSITION_TOLERANCE = 2.0d;

    /** Marker for "no usable text position", matching the value {@link String#indexOf(String)} returns on a miss. */
    private static final int UNKNOWN_POSITION = -1;

    private String pageTitle = "";
    private String pageURL = "";
    private String pageSentences = "";
    private int highestNumberOfConsecutiveSentences = 0;
    private List<WebLink> ingoingLinks = new ArrayList<WebLink>();
    private List<WebLink> outgoingLinks = new ArrayList<WebLink>();
    private Collection<String> paginationLinks = new ArrayList<String>();
    private Collection<WebImage> images = new HashSet<WebImage>();
    private Collection<String> headlineContents = new ArrayList<String>();
    private Map<String, String> metaTags = new HashMap<String, String>();

    /**
     * Clears every extracted feature so that values of a previously processed page cannot leak into
     * the next one. This includes the consecutive-sentence count.
     */
    private void reset() {
        this.pageTitle = "";
        this.pageURL = "";
        this.pageSentences = "";
        this.highestNumberOfConsecutiveSentences = 0;
        this.ingoingLinks = new ArrayList<WebLink>();
        this.outgoingLinks = new ArrayList<WebLink>();
        this.paginationLinks = new ArrayList<String>();
        this.images = new HashSet<WebImage>();
        this.headlineContents = new ArrayList<String>();
        setMetaTags(new HashMap<String, String>());
    }

    /**
     * Extracts all features from the given document and stores them in this instance.
     *
     * <p>All features are reset first. If {@code document} is {@code null} the features stay in
     * their reset (empty) state. Failures while extracting the title or the images are logged and
     * do not abort the extraction of the remaining features.</p>
     *
     * @param document the parsed page, may be {@code null}
     */
    public void extractFeatures(Document document) {
        reset();
        if (document == null) {
            return;
        }

        try {
            setPageTitle(PageAnalyzer.extractTitle(document));
        } catch (Exception e) {
            LOGGER.warn("Could not extract the page title", e);
        }

        setPageURL(document.getDocumentURI());
        this.metaTags = PageAnalyzer.extractMetaInformation(document);

        extractHeadlines(document);
        extractLinks(document, UrlHelper.getDomain(getPageURL()));
        LOGGER.debug("Ingoing Links: {}", this.ingoingLinks.size());
        LOGGER.debug("Outgoing Links: {}", this.outgoingLinks.size());

        PalladianContentExtractor contentExtractor = new PalladianContentExtractor();
        try {
            contentExtractor.setDocument(document);
            setImages(contentExtractor.getImages());
        } catch (Exception e) {
            LOGGER.warn("Could not extract the page content and images", e);
        }
        LOGGER.debug("Images: {}", this.images.size());

        setPageSentences(contentExtractor.getSentencesString());
        setHighestNumberOfConsecutiveSentences(getHighestNumberOfConsecutiveSentences(
                contentExtractor.getSentences(), HtmlHelper.documentToText(document)));

        ListDiscoverer listDiscoverer = new ListDiscoverer();
        listDiscoverer.findPaginationURLs(document);
        setPaginationLinks(listDiscoverer.getPaginationURLs());
        LOGGER.debug("Pagination Links: {}", this.paginationLinks.size());
    }

    /**
     * Appends the text content of all headline elements to {@link #headlineContents}: all H1
     * elements first, then all H2 elements, and so on up to H6.
     */
    private void extractHeadlines(Document document) {
        for (String tag : HEADLINE_TAGS) {
            for (Node headline : XPathHelper.getXhtmlNodes(document, "//" + tag)) {
                this.headlineContents.add(headline.getTextContent());
            }
        }
    }

    /**
     * Collects all anchor elements of the document as {@link WebLink}s and sorts them into ingoing
     * and outgoing links.
     *
     * <p>A link counts as ingoing when its domain equals {@code pageDomain} (ignoring case) or when
     * its URL does not start with {@link HttpHost#DEFAULT_SCHEME_NAME}, which covers relative links
     * and anchors without an {@code href}. Every other link is outgoing.</p>
     */
    private void extractLinks(Document document, String pageDomain) {
        for (Node anchor : XPathHelper.getXhtmlNodes(document, "//A")) {
            String href = extractHref(anchor);

            WebLink link = new WebLink();
            link.setTitle("");
            link.setText(anchor.getTextContent());
            link.setUrl(href);

            boolean sameDomain = UrlHelper.getDomain(href).equalsIgnoreCase(pageDomain);
            boolean withoutScheme = !href.startsWith(HttpHost.DEFAULT_SCHEME_NAME);
            if (sameDomain || withoutScheme) {
                this.ingoingLinks.add(link);
            } else {
                this.outgoingLinks.add(link);
            }
        }
    }

    /**
     * Reads the {@code href} attribute of an anchor node.
     *
     * @return the attribute value, or the empty string when the node has no such attribute
     */
    private static String extractHref(Node anchor) {
        Node hrefAttribute = anchor.getAttributes() == null
                ? null
                : anchor.getAttributes().getNamedItem(XHtmlWriter.HREF);
        String href = hrefAttribute == null ? null : hrefAttribute.getTextContent();
        if (href == null) {
            LOGGER.debug("link does not have href");
            return "";
        }
        return href;
    }

    /**
     * Determines the length of the longest run of sentences that directly follow each other in the
     * given text.
     *
     * <p>The text is cleaned with {@code HtmlHelper.stripHtmlTags} and
     * {@code StringHelper.removeControlCharacters} first. A sentence continues the current run when
     * it is found within {@link #SENTENCE_POSITION_TOLERANCE} characters of the position right after
     * the previous sentence (plus one separator character). Any other sentence ends the current run
     * and starts a new run of length one, so the first sentence of each run is counted as well. A
     * sentence that cannot be found in the text has no position; the sentence after it therefore
     * always starts a new run.</p>
     *
     * @param collection the sentences in document order, may be {@code null}
     * @param str the page text the sentences are looked up in, may be {@code null}
     * @return the length of the longest run, {@code 0} if there are no sentences or no text
     */
    private int getHighestNumberOfConsecutiveSentences(Collection<String> collection, String str) {
        if (collection == null || str == null) {
            return 0;
        }
        String cleanedText = StringHelper.removeControlCharacters(HtmlHelper.stripHtmlTags(str));

        int longestRun = 0;
        int currentRun = 0;
        int expectedStart = UNKNOWN_POSITION;
        for (String sentence : collection) {
            int start = cleanedText.indexOf(sentence);
            boolean continuesRun = start != UNKNOWN_POSITION
                    && expectedStart != UNKNOWN_POSITION
                    && isWithinRange(start, expectedStart, SENTENCE_POSITION_TOLERANCE);
            if (continuesRun) {
                currentRun++;
            } else {
                longestRun = Math.max(longestRun, currentRun);
                // The sentence that breaks a run is itself the first sentence of the next run.
                currentRun = 1;
            }
            expectedStart = start == UNKNOWN_POSITION
                    ? UNKNOWN_POSITION
                    : start + sentence.length() + 1;
        }
        return Math.max(longestRun, currentRun);
    }

    /**
     * Checks whether {@code d} lies in the closed interval {@code [d2 - d3, d2 + d3]}.
     */
    private static boolean isWithinRange(double d, double d2, double d3) {
        return d <= d2 + d3 && d >= d2 - d3;
    }

    /**
     * Retrieves the document for the given string with a new {@link DocumentRetriever} and extracts
     * its features. If no document is returned, the features are left in their reset state.
     *
     * @param str the location of the page, passed to {@code DocumentRetriever.getWebDocument}
     */
    public void extractFeatures(String str) {
        extractFeatures(new DocumentRetriever().getWebDocument(str));
    }

    /** @return the extracted page title, empty string after a reset */
    public String getPageTitle() {
        return this.pageTitle;
    }

    /** @param str the page title to store */
    public void setPageTitle(String str) {
        this.pageTitle = str;
    }

    /** @return the URI of the last processed document, empty string after a reset */
    public String getPageURL() {
        return this.pageURL;
    }

    /** @param str the page URL to store */
    public void setPageURL(String str) {
        this.pageURL = str;
    }

    /** @return the sentences string delivered by the content extractor, empty string after a reset */
    public String getPageSentences() {
        return this.pageSentences;
    }

    /** @param str the sentences string to store */
    public void setPageSentences(String str) {
        this.pageSentences = str;
    }

    /** @return the links classified as ingoing; this is the internal list, not a copy */
    public List<WebLink> getIngoingLinks() {
        return this.ingoingLinks;
    }

    /** @param list the ingoing links to store */
    public void setIngoingLinks(List<WebLink> list) {
        this.ingoingLinks = list;
    }

    /** @return the links classified as outgoing; this is the internal list, not a copy */
    public List<WebLink> getOutgoingLinks() {
        return this.outgoingLinks;
    }

    /** @param list the outgoing links to store */
    public void setOutgoingLinks(List<WebLink> list) {
        this.outgoingLinks = list;
    }

    /** @return the pagination URLs found by the {@link ListDiscoverer} */
    public Collection<String> getPaginationLinks() {
        return this.paginationLinks;
    }

    /** @param collection the pagination links to store */
    public void setPaginationLinks(Collection<String> collection) {
        this.paginationLinks = collection;
    }

    /** @return the images delivered by the content extractor */
    public Collection<WebImage> getImages() {
        return this.images;
    }

    /** @param collection the images to store */
    public void setImages(Collection<WebImage> collection) {
        this.images = collection;
    }

    /**
     * Classifies a page.
     *
     * @param str identifies the page to classify; presumably its URL (the meaning is defined by the
     *            implementing subclass)
     * @return the classification result
     */
    public abstract T classify(String str);

    /** @param i the length of the longest run of consecutive sentences to store */
    public void setHighestNumberOfConsecutiveSentences(int i) {
        this.highestNumberOfConsecutiveSentences = i;
    }

    /** @return the length of the longest run of consecutive sentences, {@code 0} after a reset */
    public int getHighestNumberOfConsecutiveSentences() {
        return this.highestNumberOfConsecutiveSentences;
    }

    /** @return the text contents of the H1 to H6 elements of the page */
    public Collection<String> getHeadlineContents() {
        return this.headlineContents;
    }

    /** @param collection the headline contents to store */
    public void setHeadlineContents(Collection<String> collection) {
        this.headlineContents = collection;
    }

    /** @param map the meta tag information to store */
    public void setMetaTags(Map<String, String> map) {
        this.metaTags = map;
    }

    /** @return the meta information extracted by {@link PageAnalyzer} */
    public Map<String, String> getMetaTags() {
        return this.metaTags;
    }
}
