package ws.palladian.extraction.content;

import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import com.sun.syndication.feed.atom.Content;
import edu.smu.tspell.wordnet.impl.file.SenseKey;
import edu.stanford.nlp.ling.CoreLabel;
import java.awt.image.BufferedImage;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.http.HttpHost;
import org.json.JSONArray;
import org.json.JSONException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import ws.palladian.extraction.date.PageDateType;
import ws.palladian.extraction.date.WebPageDateEvaluator;
import ws.palladian.extraction.multimedia.ImageHandler;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.date.ExtractedDate;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.PageAnalyzer;
import ws.palladian.retrieval.XPathSet;
import ws.palladian.retrieval.helper.JsonObjectWrapper;
import ws.palladian.retrieval.resources.WebImage;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/content/PalladianContentExtractor.class */
public class PalladianContentExtractor extends WebPageContentExtractor {
    /** Logger of this extractor (previously registered under the Readability extractor's category by mistake). */
    private static final Logger LOGGER = LoggerFactory.getLogger(PalladianContentExtractor.class);

    /**
     * Values of {@code class}, {@code id} or {@code itemprop} attributes that mark the main content node;
     * filled by the static initializer of this class.
     */
    private static final List<String> MAIN_NODE_HINTS = new ArrayList<String>();

    /** The document currently being processed. */
    private Document document;

    /** The node determined to hold the main content; {@code null} when none was found. */
    private Node resultNode;

    /** Sentences found in the cleaned document. */
    private List<String> sentences = new ArrayList<String>();

    /** Readable text of every comment node that was removed from the document. */
    private List<String> comments = new ArrayList<String>();

    /** Serialized markup of {@link #resultNode}. */
    private String mainContentHtml = "";

    /** Text of the main content. */
    private String mainContentText = "";

    /** Text of the whole document, captured before the DOM is cleaned. */
    private String fullTextContent = "";

    /** Pixel size assumed for the container when an image dimension is given as a percentage. */
    private static final int DEFAULT_IMAGE_CONTAINER_SIZE = 500;

    /** Lazily built image list; {@code null} means the images were not collected yet for the current document. */
    private List<WebImage> imageURLs;

    /**
     * Makes the given document the current one, discards everything that was derived from a
     * previously set document and runs the extraction right away.
     *
     * @param document the document to extract content from
     * @return this extractor, so that calls can be chained
     * @throws PageContentExtractorException passed on from {@code parseDocument()}
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public PalladianContentExtractor setDocument(Document document) throws PageContentExtractorException {
        // Reset all cached results first; parseDocument() fills them again below.
        this.resultNode = null;
        this.imageURLs = null;
        this.mainContentHtml = "";
        this.mainContentText = "";
        this.fullTextContent = "";
        this.sentences = new ArrayList<String>();
        this.comments = new ArrayList<String>();

        this.document = document;
        parseDocument();
        return this;
    }

    /**
     * Returns the document that was handed to the extractor most recently.
     *
     * @return the current document, or {@code null} if none has been set yet
     */
    public Document getDocument() {
        return this.document;
    }

    /**
     * Returns the sentences that were tokenized from the cleaned document.
     *
     * @return the internal sentence list itself (not a copy)
     */
    public List<String> getSentences() {
        return this.sentences;
    }

    /**
     * Returns the readable text of the comment nodes that were stripped from the document.
     *
     * @return the internal comment list itself (not a copy)
     */
    public List<String> getComments() {
        return this.comments;
    }

    /**
     * Normalizes an XPath before it is evaluated: text steps are replaced, the leading
     * {@code html/body} prefix (with or without the {@code xhtml:} namespace) is dropped and
     * a path that ends up empty is replaced by {@code //body}.
     *
     * @param str the XPath to normalize
     * @return the normalized XPath
     */
    private String cleanXPath(String str) {
        // "/text" steps (optionally indexed) are swapped for the separator constant
        // (presumably "/" - confirm against CoreLabel.TAG_SEPARATOR).
        String cleaned = str.replaceAll("/text(\\[.*?\\])?", CoreLabel.TAG_SEPARATOR);
        cleaned = cleaned.replace("html/body", "");
        cleaned = cleaned.replace("xhtml:html/xhtml:body", "");
        cleaned = cleaned.replace("///", "//");

        if (cleaned.isEmpty() || cleaned.equals("//")) {
            cleaned = "//body";
        }

        // A dangling "//" at the end would not be a complete step, so cut it off.
        return cleaned.endsWith("//") ? cleaned.substring(0, cleaned.length() - 2) : cleaned;
    }

    /** Matches whole lines of at most 40 characters; MULTILINE lets ^ and $ anchor at line breaks. */
    private static final Pattern SHORT_LINE_PATTERN = Pattern.compile("^.{0,40}$", Pattern.MULTILINE);

    /**
     * Returns the text of the whole document after tidying it up: tabs are removed, lines of
     * at most 40 characters are replaced by {@code FileHelper.NEWLINE_CHARACTER} and runs of
     * blank lines are collapsed to a single blank line.
     *
     * <p>The tidied text is written back to the {@code fullTextContent} field.</p>
     *
     * @return the tidied full text of the document
     */
    public String getEntireTextContent() {
        String text = this.fullTextContent.replaceAll("(\t)+", "");
        text = SHORT_LINE_PATTERN.matcher(text).replaceAll(FileHelper.NEWLINE_CHARACTER);
        // Whitespace-only lines between two line breaks count as blank lines.
        text = text.replaceAll("\n(\\s)+\n", "\n\n");
        text = text.replaceAll("(\n){2,}", "\n\n");
        this.fullTextContent = text;
        return text;
    }

    /**
     * Determines the main content node of the current document and fills {@code resultNode},
     * {@code sentences}, {@code fullTextContent}, {@code mainContentHtml} and
     * {@code mainContentText}.
     *
     * <p>Two sources are combined: a node found through the attribute hints in
     * {@code MAIN_NODE_HINTS}, and the XPath under which most sentences of the page are found.
     * If no node can be resolved at all, the full document text is used as main content.</p>
     *
     * @throws PageContentExtractorException declared by the signature; no statement in this
     *         method throws it explicitly
     */
    private void parseDocument() throws PageContentExtractorException {
        // z == true means: keep the XPath currently held in str instead of deriving one from the sentence XPaths.
        boolean z = false;
        // str: XPath that is finally resolved to the result node.
        String str = "";
        // str2: XPath of the hint node; stays empty when no hint matched.
        String str2 = "";
        this.resultNode = getMainContentNodeWithHints();
        // i: number of longer direct text nodes below the hint node.
        int i = 0;
        if (this.resultNode != null) {
            str2 = XPathHelper.addXhtmlNsToXPath(getDocument(), PageAnalyzer.constructXPath(this.resultNode));
            str = str2;
            i = countDirectTextNodes();
            LOGGER.debug("direct text nodes: " + i);
        }
        // The full text is captured before cleanDom() removes comment, style, script and iframe nodes.
        this.fullTextContent = HtmlHelper.documentToText(this.document);
        cleanDom();
        this.sentences = Tokenizer.getSentences(HtmlHelper.documentToText(this.document), true);
        // Collect the XPaths of every distinct sentence; the XPathSet keeps a count per path.
        XPathSet xPathSet = new XPathSet();
        Iterator it = new HashSet(this.sentences).iterator();
        while (it.hasNext()) {
            Iterator<String> it2 = PageAnalyzer.constructAllXPaths(getDocument(), (String) it.next()).iterator();
            while (it2.hasNext()) {
                xPathSet.add(PageAnalyzer.removeXPathIndicesFromLastCountNode(it2.next()));
            }
        }
        // NOTE(review): xPathMap is fetched before entries are removed below; whether it is a live
        // view or a snapshot of the set depends on XPathSet.getXPathMap() - confirm.
        LinkedHashMap<String, Integer> xPathMap = xPathSet.getXPathMap();
        String highestCountXPath = xPathSet.getHighestCountXPath();
        int countOfXPath = xPathSet.getCountOfXPath(highestCountXPath);
        HashSet newHashSet = CollectionHelper.newHashSet();
        if (!str2.isEmpty()) {
            // A hint node exists: drop every sentence XPath that does not start with its XPath.
            for (Map.Entry<String, Integer> entry : xPathMap.entrySet()) {
                if (!entry.getKey().startsWith(str2)) {
                    newHashSet.add(entry.getKey());
                }
            }
            Iterator it3 = newHashSet.iterator();
            while (it3.hasNext()) {
                xPathSet.remove((String) it3.next());
            }
            if (xPathSet.isEmpty()) {
                // No sentence lies below the hint node, so the hint node itself is used.
                z = true;
            } else {
                highestCountXPath = xPathSet.getHighestCountXPath();
                countOfXPath = xPathSet.getCountOfXPath(highestCountXPath);
                // More than three longer direct text nodes: the hint node itself is used.
                if (i > 3) {
                    z = true;
                }
            }
        }
        // str3: candidate XPath derived from the sentence counts.
        String str3 = highestCountXPath;
        if (z) {
            str = str2;
        } else {
            // Among the paths that share the highest count, prefer the shortest one.
            for (Map.Entry<String, Integer> entry2 : xPathMap.entrySet()) {
                if (entry2.getKey().length() < str3.length() && entry2.getValue().intValue() == countOfXPath) {
                    str3 = entry2.getKey();
                }
            }
        }
        // NOTE(review): this dereferences str3; it fails if getHighestCountXPath() can return null - confirm.
        if (str3.isEmpty()) {
            z = true;
        }
        String findLastBoxSection = PageAnalyzer.findLastBoxSection(str3);
        if (!z) {
            str = XPathHelper.getParentXPath(findLastBoxSection);
        }
        String cleanXPath = cleanXPath(str);
        this.resultNode = XPathHelper.getXhtmlNode(getDocument(), cleanXPath);
        if (this.resultNode == null) {
            // Fallback 1: replace path segments that carry a namespace prefix not starting with 'x' by "//".
            String replaceAll = cleanXPath.replaceAll("\\/[^x].*?\\:.*?\\/", "//");
            this.resultNode = XPathHelper.getXhtmlNode(getDocument(), replaceAll);
            if (this.resultNode == null) {
                // Fallback 2: retry with the XHTML namespace added to the XPath.
                this.resultNode = XPathHelper.getXhtmlNode(getDocument(), XPathHelper.addXhtmlNsToXPath(replaceAll));
                if (this.resultNode == null) {
                    // Nothing could be resolved: the whole page text becomes the main content.
                    this.mainContentText = this.fullTextContent;
                    return;
                }
            }
        }
        if (!z) {
            // Gather the text of the content nodes plus headline, span, list and blockquote siblings.
            String addHeadlineSiblings = addHeadlineSiblings(findLastBoxSection);
            StringBuilder sb = new StringBuilder();
            Iterator<Node> it4 = XPathHelper.getXhtmlNodes(getDocument(), addHeadlineSiblings).iterator();
            while (it4.hasNext()) {
                String textContent = it4.next().getTextContent();
                if (!textContent.isEmpty()) {
                    sb.append(textContent).append("\n\n");
                }
            }
            this.mainContentText = sb.toString();
        }
        this.mainContentHtml = HtmlHelper.xmlToString(this.resultNode, true);
        // Texts shorter than 100 characters are treated as a failed extraction:
        // first fall back to the readable text of the result node, then to the full page text.
        if (this.mainContentText.trim().length() < 100) {
            this.mainContentText = HtmlHelper.documentToReadableText(this.resultNode);
        }
        if (this.mainContentText.trim().length() < 100) {
            this.mainContentText = this.fullTextContent;
        }
    }

    /**
     * Counts the text nodes directly below the result node whose trimmed content is longer
     * than 20 characters and does not start with an HTML comment opener.
     *
     * @return the number of such text nodes
     */
    private int countDirectTextNodes() {
        int count = 0;
        for (Node textNode : XPathHelper.getXhtmlNodes(this.resultNode, "./text()")) {
            String text = textNode.getTextContent().trim();
            boolean longEnough = text.length() > 20;
            if (longEnough && !text.startsWith("<!--")) {
                count++;
            }
        }
        return count;
    }

    /**
     * Strips the document of content that must not end up in the extracted text: comment
     * sections first, then every style, script and iframe element.
     */
    private void cleanDom() {
        removeCommentNodes();

        List<Node> unwantedElements = XPathHelper.getXhtmlNodes(this.document, "//*[(self::xhtml:style) or (self::xhtml:script) or (self::xhtml:iframe)]");
        for (Node element : unwantedElements) {
            if (element == null) {
                continue;
            }
            Node parent = element.getParentNode();
            if (parent == null) {
                // Already detached; nothing to remove it from.
                continue;
            }
            parent.removeChild(element);
        }
    }

    /**
     * Finds div, p and section elements whose class or id marks them as user comments,
     * stores their readable text in {@code comments} and detaches them from the document.
     */
    private void removeCommentNodes() {
        List<Node> commentNodes = XPathHelper.getXhtmlNodes(this.document, "//*[(self::xhtml:div) or (self::xhtml:p) or (self::xhtml:section)][@class='comment' or contains(@class,'comments ') or contains(@class,' comments') or contains(@id,'comments') or @id='disqus_thread']");
        for (Node commentNode : commentNodes) {
            // Keep the text before the node disappears from the tree.
            String commentText = HtmlHelper.documentToReadableText(commentNode);
            this.comments.add(commentText);

            Node parent = commentNode.getParentNode();
            parent.removeChild(commentNode);
        }
    }

    /**
     * Looks for the main content node via well-known attribute values. For every hint, div,
     * p and span elements are searched whose class, itemprop or id carries that hint; the
     * first hint that yields a node wins.
     *
     * @return the hinted node (or its parent when several elements matched), or
     *         {@code null} if no hint matched
     */
    private Node getMainContentNodeWithHints() {
        for (String hint : MAIN_NODE_HINTS) {
            String hintXPath = "//*[(self::xhtml:div) or (self::xhtml:p) or (self::xhtml:span)][@class='" + hint + "' or contains(@class,'" + hint + " ') or contains(@class,' " + hint + "') or @itemprop='" + hint + "' or @id='" + hint + "']";
            List<Node> matches = XPathHelper.getXhtmlNodes(getDocument(), hintXPath);
            if (matches.isEmpty()) {
                continue;
            }

            Node candidate = matches.get(0);
            if (matches.size() > 1) {
                // Several hinted elements: take the parent of the first one instead.
                candidate = candidate.getParentNode();
            }
            if (candidate != null) {
                LOGGER.debug("found main node with hint: " + hint);
                return candidate;
            }
        }
        return null;
    }

    /** Element names that are selected in addition to the last step of the content XPath. */
    private static final String[] HEADLINE_SIBLING_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "span", "ul", "ol", "blockquote"};

    /**
     * Widens the last step of an XPath so that it also selects headline, span, list and
     * blockquote elements at the same level, e.g. {@code .../xhtml:p} becomes
     * {@code .../*[(self::xhtml:p) or (self::xhtml:h1) or ...]}.
     *
     * <p>The last step is replaced by plain string operations rather than a regular
     * expression, so steps containing predicates or other regex metacharacters are handled
     * as well. Inputs without a usable last step are returned unchanged.</p>
     *
     * @param str the XPath whose last step should be widened; may be {@code null}
     * @return the widened XPath, or {@code str} itself if it has no last step to widen
     */
    private String addHeadlineSiblings(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        String[] segments = str.split(CoreLabel.TAG_SEPARATOR);
        if (segments.length == 0) {
            // The path consists of separators only.
            return str;
        }
        String lastSegment = segments[segments.length - 1];
        if (lastSegment.isEmpty() || !str.endsWith(lastSegment)) {
            // E.g. a trailing separator: there is no final step that could be widened.
            return str;
        }

        // The added element names carry the namespace prefix only if the last step does.
        String namespacePrefix = lastSegment.contains(Content.XHTML) ? "xhtml:" : "";
        StringBuilder widened = new StringBuilder(str.substring(0, str.length() - lastSegment.length()));
        widened.append("*[(self::").append(lastSegment).append(")");
        for (String tag : HEADLINE_SIBLING_TAGS) {
            widened.append(" or (self::").append(namespacePrefix).append(tag).append(")");
        }
        widened.append("]");
        return widened.toString();
    }

    /**
     * Returns the images of the main content whose type equals the given one, ignoring case.
     *
     * @param str the image type to filter by
     * @return a new list holding the matching images; empty if none match
     */
    public List<WebImage> getImages(String str) {
        String wantedType = str.toLowerCase();
        List<WebImage> matching = new ArrayList<WebImage>();
        for (WebImage image : getImages()) {
            String imageType = image.getType().toLowerCase();
            if (imageType.equalsIgnoreCase(wantedType)) {
                matching.add(image);
            }
        }
        return matching;
    }

    /**
     * Returns the images found for the current result node.
     *
     * @return the images as delivered by {@link #getImages(Node)} for the result node
     */
    public List<WebImage> getImages() {
        return getImages(this.resultNode);
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Collects the images in or around the given node. The search starts at the node and
     * moves up to its ancestors until at least one {@code img} element is found.
     *
     * <p>The result is cached: once a list has been built for the current document it is
     * returned for every later call, whatever node is passed. If no result node exists, the
     * list stays empty. Images that lack a {@code src} attribute or whose width or height
     * cannot be parsed are left out.</p>
     *
     * @param node the node at which the search starts
     * @return the cached list of images
     */
    public List<WebImage> getImages(Node node) {
        if (this.imageURLs != null) {
            return this.imageURLs;
        }
        this.imageURLs = new ArrayList<WebImage>();
        if (this.resultNode == null) {
            return this.imageURLs;
        }

        // Climb towards the root until some level contains images.
        List<Node> imgNodes = CollectionHelper.newArrayList();
        for (Node scope = node; imgNodes.isEmpty() && scope != null; scope = scope.getParentNode()) {
            imgNodes = XPathHelper.getXhtmlNodes(scope, ".//xhtml:img");
        }

        for (Node imgNode : imgNodes) {
            try {
                WebImage image = new WebImage();
                NamedNodeMap attrs = imgNode.getAttributes();

                String src = attrs.getNamedItem(XHtmlWriter.SRC).getTextContent();
                if (!src.startsWith(HttpHost.DEFAULT_SCHEME_NAME)) {
                    // Not an absolute URL: resolve it against the document URI.
                    src = UrlHelper.makeFullUrl(getDocument().getDocumentURI(), null, src);
                }
                image.setUrl(src);

                Node alt = attrs.getNamedItem(XHtmlWriter.ALT);
                if (alt != null) {
                    image.setAlt(alt.getTextContent());
                }
                Node title = attrs.getNamedItem("title");
                if (title != null) {
                    image.setTitle(title.getTextContent());
                }
                Node width = attrs.getNamedItem(XHtmlWriter.WIDTH);
                if (width != null) {
                    image.setWidth(getImageSize(width.getTextContent()));
                }
                Node height = attrs.getNamedItem(XHtmlWriter.HEIGHT);
                if (height != null) {
                    image.setHeight(getImageSize(height.getTextContent()));
                }

                this.imageURLs.add(image);
            } catch (NullPointerException e) {
                LOGGER.debug("an image has not all necessary attributes");
            } catch (NumberFormatException e2) {
                LOGGER.debug(e2.getMessage());
            }
        }
        return this.imageURLs;
    }

    /**
     * Converts the value of a width or height attribute into a pixel count. Values that
     * contain the percent marker are taken as a percentage of the default container size;
     * otherwise a {@code px} suffix is removed and the rest is parsed as an integer.
     *
     * @param str the raw attribute value
     * @return the size in pixels
     * @throws NumberFormatException if the remaining text is not an integer
     */
    private int getImageSize(String str) throws NumberFormatException {
        // NOTE(review): String.replace is literal, so only the exact text ",*" is removed here;
        // a regular expression may have been intended - confirm.
        String value = str.replace(",*", "");

        if (value.indexOf(SenseKey.LEMMA_TERMINATOR) < 0) {
            return Integer.parseInt(StringHelper.trim(value.replace("px", "")));
        }

        int percent = Integer.parseInt(StringHelper.trim(value.replace(SenseKey.LEMMA_TERMINATOR, "")));
        return (int) (0.01d * percent * DEFAULT_IMAGE_CONTAINER_SIZE);
    }

    /**
     * Returns the node that was determined to hold the main content.
     *
     * @return the result node, or {@code null} if none was found
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public Node getResultNode() {
        return this.resultNode;
    }

    /**
     * Returns the serialized markup of the result node.
     *
     * @return the main content markup; empty if no result node was resolved
     */
    public String getMainContentHtml() {
        return this.mainContentHtml;
    }

    /**
     * Returns the extracted main content as text.
     *
     * @return the main content text
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultText() {
        return this.mainContentText;
    }

    /**
     * Joins all extracted sentences into one string. Every sentence, including the last
     * one, is followed by the separator constant.
     *
     * @return the concatenated sentences; empty if there are none
     */
    public String getSentencesString() {
        StringBuilder joined = new StringBuilder();
        for (String sentence : getSentences()) {
            joined.append(sentence);
            joined.append(Strings.SINGLE_SPACE_STRING);
        }
        return joined.toString();
    }

    /**
     * Determines a title for the page. The cleaned text of the first {@code h1} element is
     * preferred; if that is missing or empty, the {@code title} element is used with
     * everything from the first {@code |} on removed; if there is no title element either,
     * the beginning of the main content text is returned.
     *
     * @return the title of the page
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultTitle() {
        Node headline = XPathHelper.getXhtmlNode(getDocument(), "//h1");
        if (headline != null) {
            String headlineText = StringHelper.clean(headline.getTextContent());
            if (!headlineText.isEmpty()) {
                return headlineText;
            }
        }

        Node titleNode = XPathHelper.getXhtmlNode(getDocument(), "//title");
        if (titleNode == null) {
            return StringHelper.getFirstWords(this.mainContentText, 20);
        }
        return titleNode.getTextContent().replaceAll("\\|.*", "").trim();
    }

    /**
     * Returns the display name of this extractor.
     *
     * @return the constant name {@code "Palladian"}
     */
    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getExtractorName() {
        return "Palladian";
    }

    /**
     * Fills in missing image dimensions. Every image whose width or height is zero is
     * loaded from its URL and, if loading succeeds, gets the real width and height assigned.
     */
    public void analyzeImages() {
        for (WebImage image : getImages()) {
            boolean sizeKnown = image.getWidth() != 0 && image.getHeight() != 0;
            if (sizeKnown) {
                continue;
            }

            BufferedImage loaded = ImageHandler.load(image.getUrl());
            if (loaded == null) {
                // Could not be loaded; leave the image as it is.
                continue;
            }
            image.setWidth(loaded.getWidth());
            image.setHeight(loaded.getHeight());
        }
    }

    /**
     * Asks the WebKnox author API for the author of the current document.
     *
     * <p>The document URI and the API key are URL-encoded before they are placed in the
     * query string, so URIs that themselves contain {@code &}, {@code #} or {@code ?} are
     * transmitted intact.</p>
     *
     * @param str the WebKnox API key
     * @return the name of the first author in the response, or an empty string if the
     *         service returned nothing usable
     */
    public String getAuthorName(String str) {
        String requestUrl = "http://webknox.com/api/webpage/author?url=" + encodeQueryValue(getDocument().getDocumentURI()) + "&language=en&apiKey=" + encodeQueryValue(str);
        JSONArray jsonArray = new DocumentRetriever().getJsonArray(requestUrl);
        if (jsonArray == null || jsonArray.length() == 0) {
            return "";
        }
        try {
            return new JsonObjectWrapper(jsonArray.getJSONObject(0)).getString(XHtmlWriter.NAME);
        } catch (JSONException e) {
            // Best effort: an unexpected response shape yields "no author", but leaves a trace.
            LOGGER.debug("could not read the author name from the response: " + e.getMessage());
            return "";
        }
    }

    /**
     * URL-encodes a value for use in a query string; {@code null} is encoded as the text
     * {@code "null"}, which is what plain string concatenation would have produced.
     */
    private static String encodeQueryValue(String value) {
        try {
            return URLEncoder.encode(String.valueOf(value), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
        }
    }

    /**
     * Determines the publish date of the current document by delegating to
     * {@code WebPageDateEvaluator.getBestDate} with {@code PageDateType.PUBLISH}.
     *
     * @return the date delivered by the evaluator
     */
    public ExtractedDate getPublishDate() {
        return WebPageDateEvaluator.getBestDate(this.document, PageDateType.PUBLISH);
    }

    /**
     * Small demo: runs the extractor on a fixed news article and prints the title, the main
     * text, the comments and the full text to standard output.
     *
     * @param strArr command line arguments (unused)
     * @throws PageContentExtractorException if the page cannot be processed
     */
    public static void main(String[] strArr) throws PageContentExtractorException {
        String articleUrl = "http://www.washingtonpost.com/politics/decision2012/after-grueling-campaign-polls-open-for-election-day-2012/2012/11/06/d1c24c98-2802-11e2-b4e0-346287b7e56c_story.html";

        PalladianContentExtractor extractor = new PalladianContentExtractor();
        extractor.setDocument(articleUrl);

        System.out.println("Title: " + extractor.getResultTitle());
        System.out.println("Result Text: " + extractor.getResultText());
        System.out.println("Comments: ");
        CollectionHelper.print(extractor.getComments());
        System.out.println("Full Text: " + extractor.getEntireTextContent());
    }

    // Registers the attribute values that typically mark the main article container.
    static {
        String[] hints = {"articleText", "article_body", "article-body", "articleBody"};
        for (String hint : hints) {
            MAIN_NODE_HINTS.add(hint);
        }
    }
}
