package ws.palladian.extraction;

import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import edu.stanford.nlp.ling.CoreLabel;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.JaroWinklerSimilarity;
import ws.palladian.helper.nlp.NGramSimilarity;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.PageAnalyzer;
import ws.palladian.retrieval.XPathSet;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/ListDiscoverer.class */
public class ListDiscoverer {
    /** Logger shared by all instances of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(ListDiscoverer.class);
    /** Retriever used to download documents by URL; assigned in the constructor. */
    private DocumentRetriever crawler;
    /** XPath of the detected pagination links; the empty string means none has been determined. */
    private String paginationXPath = "";
    /** URL of the document currently being analyzed. */
    private String url = "";
    /** Document currently being analyzed, or null if none has been set yet. */
    private Document document = null;
    /** Pagination URLs collected so far by the findPaginationURLs methods. */
    private Set<String> paginationURLs = new HashSet();

    /**
     * Creates a discoverer that downloads pages through a freshly constructed
     * {@link DocumentRetriever}.
     */
    public ListDiscoverer() {
        crawler = new DocumentRetriever();
    }

    /**
     * Finds pagination URLs in an already loaded document.
     *
     * <p>The document and its URI become the current document/URL of this instance. If the
     * given document is not the one analyzed last, the cached pagination XPath is discarded
     * so that it is detected afresh, mirroring what {@link #findPaginationURLs(String)} does
     * when the URL changes.
     *
     * @param document the document to analyze; when {@code null} nothing is analyzed
     * @return the pagination URLs found; for a {@code null} document the previously
     *         collected URLs are returned unchanged
     */
    public Set<String> findPaginationURLs(Document document) {
        if (document == null) {
            return this.paginationURLs;
        }
        // An XPath detected for a different document must not be reused for this one.
        if (this.document != document) {
            this.paginationXPath = "";
        }
        this.document = document;
        this.url = document.getDocumentURI();
        return findPaginationURLs();
    }

    /**
     * Finds pagination URLs on the page at the given URL.
     *
     * <p>The page is downloaded only if no document is loaded yet or the URL differs
     * (ignoring case) from the current one; in that case the cached pagination XPath is
     * discarded as well. Otherwise the already loaded document is reused.
     *
     * @param str the URL of the page to analyze
     * @return the pagination URLs found
     */
    public Set<String> findPaginationURLs(String str) {
        // Compare from the argument's side so that a null current URL cannot cause an NPE.
        boolean sameUrl = str != null && str.equalsIgnoreCase(this.url);
        if (this.document == null || !sameUrl) {
            this.document = this.crawler.getWebDocument(str);
            this.url = str;
            this.paginationXPath = "";
        }
        return findPaginationURLs();
    }

    /**
     * Finds pagination URLs in the current document.
     *
     * <p>If no pagination XPath is set, the collected URLs are reset and the XPath is detected
     * heuristically (see {@link #detectPaginationXPath()}). If an XPath is already set, it is
     * reused and newly found URLs are added to the ones collected before. The hrefs selected
     * by the XPath are resolved against the current URL and then filtered by
     * {@code filterPaginationUrls()}.
     *
     * @return the pagination URLs; empty if no pagination XPath could be determined
     */
    public Set<String> findPaginationURLs() {
        if (this.paginationXPath.length() == 0) {
            this.paginationURLs = new HashSet<String>();
            this.paginationXPath = detectPaginationXPath();
        }
        if (this.paginationXPath.length() == 0) {
            return this.paginationURLs;
        }
        this.paginationXPath = removeHtmlBody(this.paginationXPath);
        List<Node> hrefNodes = XPathHelper.getXhtmlNodes(this.document, this.paginationXPath);
        // getXhtmlNodes is null-checked elsewhere in this class, so treat null as "no matches".
        if (hrefNodes != null) {
            for (Node hrefNode : hrefNodes) {
                String fullUrl = UrlHelper.makeFullUrl(this.url, hrefNode.getTextContent());
                if (fullUrl.length() > 0) {
                    this.paginationURLs.add(fullUrl);
                }
            }
        }
        filterPaginationUrls();
        return this.paginationURLs;
    }

    /**
     * Heuristically determines the XPath (ending in {@code /@href}) of the pagination links
     * in the current document.
     *
     * <p>Steps: collect the index-free XPaths of all anchors whose label looks like a
     * pagination label; rank those XPaths by how similar their href values are to each other;
     * finally require the numeric labels under the chosen XPath to form a mostly consecutive
     * sequence.
     *
     * @return the detected XPath, or the empty string if none qualifies
     */
    private String detectPaginationXPath() {
        XPathSet candidates = new XPathSet();
        // Tags whose positional indices are stripped from the constructed XPaths.
        // NOTE(review): the XHtmlWriter constants are presumably "tr", "td" and "li" - confirm.
        String[] indexFreeTags = {"a", XHtmlWriter.TR, XHtmlWriter.TD, "p", "span", XHtmlWriter.LI};
        List<Node> anchors = XPathHelper.getXhtmlNodes(this.document, "//a");
        if (anchors == null) {
            return "";
        }
        for (Node anchor : anchors) {
            if (looksLikePaginationLabel(paginationLabel(anchor))) {
                candidates.add(PageAnalyzer.removeXPathIndices(PageAnalyzer.constructXPath(anchor), indexFreeTags));
            }
        }

        String xPath;
        LinkedHashMap<String, Integer> candidateCounts = candidates.getXPathMap();
        if (!candidateCounts.isEmpty()) {
            LinkedHashMap<String, Double> similarityByXPath = new LinkedHashMap<String, Double>();
            JaroWinklerSimilarity jaroWinkler = new JaroWinklerSimilarity();
            for (Map.Entry<String, Integer> candidate : candidateCounts.entrySet()) {
                List<Node> hrefNodes = XPathHelper.getXhtmlNodes(this.document, candidate.getKey() + "/@href");
                if (hrefNodes == null || hrefNodes.isEmpty()) {
                    // Nothing to judge; also avoids dividing by zero below.
                    continue;
                }
                int emptyHrefs = 0;
                Set<String> distinctHrefs = new HashSet<String>();
                for (Node hrefNode : hrefNodes) {
                    // Drop the fragment part; a link that is only a fragment counts as empty.
                    String href = hrefNode.getTextContent().replaceAll("#.*", "");
                    if (href.length() == 0) {
                        emptyHrefs++;
                    } else {
                        distinctHrefs.add(href);
                    }
                }
                // Floating-point ratio: with int/int division the 0.5 threshold could only
                // be exceeded when every single link was empty.
                if ((double) emptyHrefs / hrefNodes.size() > 0.5d) {
                    return "";
                }
                if (distinctHrefs.size() >= 2) {
                    // Average similarity over disjoint pairs (1st/2nd, 3rd/4th, ...) in set order.
                    double similaritySum = 0.0d;
                    int pairCount = 0;
                    String pairStart = "";
                    int position = 0;
                    for (String href : distinctHrefs) {
                        if (position % 2 == 0) {
                            pairStart = href;
                        } else {
                            similaritySum += jaroWinkler.getSimilarity(pairStart, href);
                            pairCount++;
                        }
                        position++;
                    }
                    double averageSimilarity = similaritySum / pairCount;
                    if (averageSimilarity > 0.8d) {
                        similarityByXPath.put(candidate.getKey(), Double.valueOf(averageSimilarity));
                    }
                }
            }
            Map<String, Double> ranked = CollectionHelper.sortByValue(similarityByXPath, false);
            if (ranked.isEmpty()) {
                xPath = candidates.getHighestCountXPath();
            } else {
                xPath = ranked.entrySet().iterator().next().getKey();
            }
            if (candidates.getCountOfXPath(xPath) == 1) {
                // A single link is only accepted if it reads like "next" or equals the
                // constant below. NOTE(review): SchemaSymbols.ATTVAL_TRUE_1 is presumably
                // "1" - confirm.
                String label = StringHelper.trim(PageAnalyzer.getTextByXPath(this.document, xPath));
                if (label.toLowerCase().indexOf("next") == -1 && !label.equals(SchemaSymbols.ATTVAL_TRUE_1)) {
                    return "";
                }
            }
        } else {
            xPath = candidates.getHighestCountXPath(3);
        }
        if (xPath.length() == 0) {
            return "";
        }

        // Collect the numeric link labels in ascending order.
        TreeSet<Integer> pageNumbers = new TreeSet<Integer>();
        List<Node> linkNodes = XPathHelper.getXhtmlNodes(this.document, xPath);
        if (linkNodes != null) {
            for (Node linkNode : linkNodes) {
                String label = paginationLabel(linkNode);
                if (StringHelper.isNumber(label)) {
                    try {
                        pageNumbers.add(Integer.valueOf(label));
                    } catch (NumberFormatException e) {
                        LOGGER.error(label + "," + e.getMessage());
                    }
                }
            }
        }
        // A single numeric label is not a page sequence.
        if (pageNumbers.size() == 1) {
            return "";
        }
        // Count the longest run of consecutive increments (n, n + 1, ...).
        int longestRun = 0;
        int currentRun = 0;
        int previous = -1;
        for (int pageNumber : pageNumbers) {
            if (previous > -1) {
                if (pageNumber == previous + 1) {
                    currentRun++;
                    if (currentRun > longestRun) {
                        longestRun = currentRun;
                    }
                } else {
                    currentRun = 0;
                }
            }
            previous = pageNumber;
        }
        if (longestRun < pageNumbers.size() / 2) {
            return "";
        }
        return xPath + "/@href";
    }

    /** Returns the trimmed text of a node with all square brackets removed. */
    private static String paginationLabel(Node node) {
        return StringHelper.trim(node.getTextContent()).replaceAll("\\[", "").replaceAll("\\]", "");
    }

    /**
     * Tells whether a link label looks like a pagination label: a number of at most three
     * characters, a single uppercase character, or a text shorter than eight characters
     * containing "next".
     */
    private static boolean looksLikePaginationLabel(String label) {
        if (label.length() == 0) {
            return false;
        }
        if (label.length() <= 3 && StringHelper.isNumber(label)) {
            return true;
        }
        if (label.length() == 1 && StringHelper.isCompletelyUppercase(label)) {
            return true;
        }
        return label.toLowerCase().indexOf("next") > -1 && label.length() < 8;
    }

    /**
     * Keeps only those pagination URLs whose length is within range of the most frequent URL
     * length, as judged by {@code MathHelper.isWithinRange(length, mostFrequentLength, 1.0)}.
     * Does nothing when there are no URLs.
     */
    private void filterPaginationUrls() {
        CountMap lengthCounts = CountMap.create();
        for (String paginationUrl : this.paginationURLs) {
            lengthCounts.add(Integer.valueOf(paginationUrl.length()));
        }
        if (lengthCounts.uniqueSize() == 0) {
            return;
        }
        // First entry of the descending map is taken as the dominant URL length.
        Map.Entry topEntry = (Map.Entry) lengthCounts.getSortedMapDescending().entrySet().iterator().next();
        int dominantLength = ((Integer) topEntry.getKey()).intValue();
        Set<String> kept = new HashSet<String>();
        for (String paginationUrl : this.paginationURLs) {
            if (MathHelper.isWithinRange(paginationUrl.length(), dominantLength, 1.0d)) {
                kept.add(paginationUrl);
            }
        }
        this.paginationURLs = kept;
    }

    /**
     * Replaces every occurrence of the absolute prefixes {@code /html/body/} and
     * {@code /xhtml:html/xhtml:body/} in an XPath with {@code //}.
     *
     * @param str the XPath to shorten
     * @return the XPath with both prefixes replaced
     */
    public static String removeHtmlBody(String str) {
        String withoutPlainPrefix = str.replace("/html/body/", "//");
        String withoutXhtmlPrefix = withoutPlainPrefix.replace("/xhtml:html/xhtml:body/", "//");
        return withoutXhtmlPrefix;
    }

    /**
     * Returns the pagination URLs collected so far.
     *
     * @return the internal set itself, not a copy
     */
    public Set<String> getPaginationURLs() {
        return this.paginationURLs;
    }

    /**
     * Builds the set of XPaths of all list items, table cells, headings (h2-h6), anchors,
     * and i/div/strong/span elements of a document. Positional indices are removed from the
     * XPaths via {@code removeXPathIndicesNot}, with {@code table} passed as the exception.
     *
     * @param document the document to scan
     * @return the collected XPaths
     */
    public XPathSet getXPathSet(Document document) {
        final String[] elementQueries = {
            "//ul/li", "//ol/li", "//td", "//h2", "//h3", "//h4", "//h5", "//h6",
            "//a", "//i", "//div", "//strong", "//span"
        };
        XPathSet collected = new XPathSet();
        for (String elementQuery : elementQueries) {
            List<Node> matches = XPathHelper.getXhtmlNodes(document, elementQuery);
            if (matches == null) {
                continue;
            }
            for (Node match : matches) {
                String fullPath = PageAnalyzer.constructXPath(match);
                collected.add(PageAnalyzer.removeXPathIndicesNot(fullPath, new String[]{"table"}));
            }
        }
        return collected;
    }

    /**
     * Downloads the page at the given URL and discovers the XPath that addresses its list of
     * entities.
     *
     * <p>The page is fetched with this instance's retriever, the same one the pagination
     * methods use.
     *
     * @param str the URL of the page to analyze
     * @return the entity XPath, or the empty string if the page could not be loaded or no
     *         suitable XPath was found
     */
    public String discoverEntityXPath(String str) {
        this.url = str;
        this.document = this.crawler.getWebDocument(str);
        if (this.document == null) {
            // Download failed: report "nothing found" instead of failing on a null document.
            return "";
        }
        return discoverEntityXPath(this.document);
    }

    /**
     * Discovers the XPath that addresses the list of entities in a document.
     *
     * <p>Candidate XPaths that carry similar text on a sibling page are removed first. The
     * longest high-count XPath among the remaining ones is chosen; if it lies inside a table,
     * it is narrowed to the entity column. The XPath is accepted only if it selects at least
     * ten entries and those entries are uniform.
     *
     * @param document the document to analyze
     * @return the entity XPath, or the empty string if the document is {@code null} or no
     *         suitable XPath was found
     */
    public String discoverEntityXPath(Document document) {
        if (document == null) {
            return "";
        }
        this.url = document.getDocumentURI();
        this.document = document;

        XPathSet allPaths = getXPathSet(document);
        if (allPaths.getXPathMap().size() == 0) {
            // Nothing to choose from; no need to fetch and compare a sibling page.
            return "";
        }
        XPathSet candidates = removeSiblingPagePaths(allPaths, this.url, document);
        if (candidates.getXPathMap().size() == 0) {
            return "";
        }
        // NOTE(review): the result is unused; the call is kept because it cannot be told from
        // here whether getHighestCountXPath() changes the state of the XPathSet.
        candidates.getHighestCountXPath();
        String entityXPath = removeHtmlBody(candidates.getLongestHighCountXPath(document));
        if (PageAnalyzer.nodeInTable(entityXPath, 6)) {
            int entityColumn = findEntityColumn(document, entityXPath);
            if (entityColumn == -1) {
                return "";
            }
            entityXPath = setIndex(entityXPath, XHtmlWriter.TD, entityColumn);
        }

        List<Node> entityNodes = XPathHelper.getXhtmlNodes(document, entityXPath);
        if (entityNodes == null) {
            return "";
        }
        List<String> entries = new ArrayList<String>();
        for (Node entityNode : entityNodes) {
            entries.add(entityNode.getTextContent());
        }
        // Check the size first so that entriesUniform never has to judge an empty list.
        if (entries.size() < 10) {
            return "";
        }
        return entriesUniform(entries, false) ? entityXPath : "";
    }

    /**
     * Removes those XPaths from a set whose content is similar on a sibling page, since such
     * content is unlikely to be specific to the given page.
     *
     * <p>For every XPath that also exists on the sibling page, the first 200 characters of
     * the text on both pages are compared with a 3-gram similarity. XPaths with a similarity
     * below 0.7, and XPaths missing on the sibling page, are kept. If at least 90% of all
     * XPaths are more than 0.98 similar, the sibling is assumed to be the same page and the
     * original set is returned.
     *
     * @param xPathSet the XPaths found in {@code document}
     * @param str      the URL of the document (not used by this method)
     * @param document the document the XPaths were taken from
     * @return the reduced set, or {@code xPathSet} itself if it is empty, no sibling page
     *         could be loaded, or the sibling looks identical to the document
     */
    public XPathSet removeSiblingPagePaths(XPathSet xPathSet, String str, Document document) {
        int totalPaths = xPathSet.getXPathMap().entrySet().size();
        if (totalPaths == 0) {
            // Nothing to remove; also keeps the ratio below from dividing by zero.
            return xPathSet;
        }
        String siblingUrl = PageAnalyzer.getSiblingPage(document);
        if (siblingUrl == null || siblingUrl.length() == 0) {
            return xPathSet;
        }
        Document siblingDocument = this.crawler.getWebDocument(siblingUrl);
        if (siblingDocument == null) {
            return xPathSet;
        }

        LinkedHashMap<String, Integer> siblingPaths = getXPathSet(siblingDocument).getXPathMap();
        NGramSimilarity trigramSimilarity = new NGramSimilarity(3);
        XPathSet pageSpecific = new XPathSet();
        int nearIdentical = 0;
        for (Map.Entry<String, Integer> entry : xPathSet.getXPathMap().entrySet()) {
            String path = entry.getKey();
            if (!siblingPaths.containsKey(path)) {
                pageSpecific.addEntry(entry);
                continue;
            }
            String ownText = leadingText(PageAnalyzer.getTextByXPath(document, path), 200);
            String siblingText = leadingText(PageAnalyzer.getTextByXPath(siblingDocument, path), 200);
            double similarity = trigramSimilarity.getSimilarity(ownText, siblingText);
            if (similarity < 0.7d) {
                pageSpecific.addEntry(entry);
            } else if (similarity > 0.98d) {
                nearIdentical++;
            }
        }
        // Floating-point ratio: with int/int division the 0.9 threshold was only reached
        // when every single XPath was near-identical.
        if ((double) nearIdentical / totalPaths < 0.9d) {
            return pageSpecific;
        }
        LOGGER.info("sibling url was probably the same as source url");
        return xPathSet;
    }

    /** Returns at most the first {@code maxLength} characters of a text. */
    private static String leadingText(String text, int maxLength) {
        return text.substring(0, Math.min(maxLength, text.length()));
    }

    /**
     * Finds the table column that most likely holds the entities.
     *
     * <p>Each column (1-based) is tested by setting the column index on the {@code td} step of
     * the XPath. A column counts as uniform if it has entries and they pass
     * {@link #entriesUniform(List, boolean)}, or if its table-cell path matches at most one
     * cell.
     *
     * @param document the document containing the table
     * @param str      the XPath pointing into the table
     * @return the index of the first uniform column; {@code 0} if all columns are uniform;
     *         {@code -1} if no column is uniform
     */
    public int findEntityColumn(Document document, String str) {
        int columnCount = PageAnalyzer.getNumberOfTableColumns(document, str);
        List<Integer> uniformColumns = new ArrayList<Integer>();
        for (int column = 1; column <= columnCount; column++) {
            String columnXPath = setIndex(str, XHtmlWriter.TD, column);
            List<Node> entryNodes = XPathHelper.getXhtmlNodes(document, columnXPath);
            List<Node> cellNodes = XPathHelper.getXhtmlNodes(document, PageAnalyzer.getTableCellPath(columnXPath));

            List<String> entries = new ArrayList<String>();
            if (entryNodes != null) {
                for (Node entryNode : entryNodes) {
                    entries.add(entryNode.getTextContent());
                }
            }
            int cellCount = cellNodes == null ? 0 : cellNodes.size();

            // Test for emptiness first so that an empty column is never passed on.
            boolean uniformEntries = !entries.isEmpty() && entriesUniform(entries, true);
            if (!uniformEntries && cellCount > 1) {
                LOGGER.info("Column " + column + CoreLabel.TAG_SEPARATOR + columnCount + " is not uniform");
            } else {
                uniformColumns.add(Integer.valueOf(column));
                LOGGER.info("Column " + column + CoreLabel.TAG_SEPARATOR + columnCount + " is uniform");
            }
        }
        if (uniformColumns.isEmpty()) {
            LOGGER.info("No uniform columns found");
            return -1;
        }
        if (uniformColumns.size() != columnCount) {
            return uniformColumns.get(0).intValue();
        }
        LOGGER.info("All columns are uniform");
        return 0;
    }

    /**
     * Sets, replaces or removes the positional index on the last occurrence of a tag in an
     * XPath.
     *
     * <p>Examples for tag {@code td}: {@code //tr/td/a} with index 3 becomes
     * {@code //tr/td[3]/a}; {@code //tr/td[2]/a} with index 3 becomes {@code //tr/td[3]/a};
     * {@code //tr/td[2]/a} with index 0 becomes {@code //tr/td/a}.
     *
     * @param str  the XPath to modify
     * @param str2 the tag name whose last occurrence gets the index
     * @param i    the index to set; {@code 0} removes an existing numeric index
     * @return the modified XPath; {@code str} unchanged if the tag does not occur or the
     *         index is negative
     */
    private String setIndex(String str, String str2, int i) {
        int tagStart = str.lastIndexOf(str2);
        if (tagStart == -1 || i < 0) {
            return str;
        }
        int tagEnd = tagStart + str2.length();
        String upToTag = str.substring(0, tagEnd);
        // Strip a numeric index directly following the tag, if any. The check has to look at
        // the text after the tag; looking at the text before it never finds the index.
        String afterTag = str.substring(tagEnd).replaceFirst("^\\[\\d+\\]", "");
        if (i == 0) {
            return upToTag + afterTag;
        }
        return upToTag + "[" + i + "]" + afterTag;
    }

    /**
     * Decides whether a list of text entries looks like a uniform list of entities.
     *
     * <p>The entries are rejected if, relative to the number of entries,
     * <ul>
     * <li>more than 15% are numeric or time expressions,</li>
     * <li>more than 50% are completely uppercase,</li>
     * <li>the average number of space-separated words exceeds 12,</li>
     * <li>({@code z} only) more than 10% are repeated occurrences of an earlier entry.</li>
     * </ul>
     * Without {@code z}, the entries are also rejected if more than 60% of the distinct
     * entries occur more than once. Entries longer than 200 characters only contribute to the
     * word count.
     *
     * @param list the entries to check
     * @param z    {@code true} to apply the strict 10% duplicate limit; {@code false} to apply
     *             the looser limit on duplicated distinct entries
     * @return {@code true} if no rejection rule applies; an empty list is vacuously uniform,
     *         so callers that need entries must check the size themselves
     */
    public static boolean entriesUniform(List<String> list, boolean z) {
        int entryCount = list.size();
        if (entryCount == 0) {
            // No entries, no evidence against uniformity; also avoids dividing by zero.
            return true;
        }
        int numericEntries = 0;
        int uppercaseEntries = 0;
        int wordCount = 0;
        int repeatedOccurrences = 0;
        int repeatedValues = 0;
        Set<String> distinctEntries = new HashSet<String>();
        Set<String> valuesSeenTwice = new HashSet<String>();

        for (String rawEntry : list) {
            String entry = StringHelper.trim(rawEntry);
            wordCount += entry.split(Strings.SINGLE_SPACE_STRING).length;
            if (entry.length() > 200) {
                continue;
            }
            try {
                if (StringHelper.isNumericExpression(entry) || StringHelper.isTimeExpression(entry)) {
                    numericEntries++;
                }
            } catch (NumberFormatException e) {
                LOGGER.error(entry, (Throwable) e);
            } catch (OutOfMemoryError e2) {
                // NOTE(review): deliberately best-effort in the original; presumably one of
                // the expression checks can exhaust memory on some inputs - confirm.
                LOGGER.error(entry, (Throwable) e2);
            }
            if (StringHelper.isCompletelyUppercase(entry)) {
                uppercaseEntries++;
            }
            if (entry.length() > 0 && !distinctEntries.add(entry)) {
                repeatedOccurrences++;
                if (valuesSeenTwice.add(entry)) {
                    repeatedValues++;
                }
            }
        }

        // All ratios are computed in floating point; int/int division truncates to 0 or 1
        // and makes the fractional thresholds meaningless.
        double total = entryCount;
        if (numericEntries / total > 0.15d) {
            LOGGER.info("entries not uniform because too many numeric entries");
            return false;
        }
        if (uppercaseEntries / total > 0.5d) {
            LOGGER.info("entries not uniform because too many entirely capitalized entries");
            return false;
        }
        if (wordCount / total > 12.0d) {
            LOGGER.info("entries not uniform because average word length too long");
            return false;
        }
        if (z && repeatedOccurrences / total > 0.1d) {
            LOGGER.info("entries not uniform because too many duplicates");
            return false;
        }
        if (z) {
            return true;
        }
        double repeatedShare = distinctEntries.isEmpty() ? 0.0d : (double) repeatedValues / distinctEntries.size();
        if (repeatedShare <= 0.6d) {
            return true;
        }
        LOGGER.info("entries not uniform because too many duplicate words");
        return false;
    }

    /**
     * Returns the current pagination XPath.
     *
     * @return the XPath, or the empty string if none is set; an XPath detected by
     *         findPaginationURLs ends with {@code /@href}
     */
    public String getPaginationXPath() {
        return this.paginationXPath;
    }

    /**
     * Sets the pagination XPath. A non-empty value makes findPaginationURLs() skip its own
     * detection and evaluate this XPath directly.
     *
     * @param str the XPath to use; the empty string requests detection on the next run
     */
    public void setPaginationXPath(String str) {
        this.paginationXPath = str;
    }

    /**
     * Sets the document to analyze. The current URL and pagination XPath are left untouched.
     *
     * @param document the document to analyze
     */
    public void setDocument(Document document) {
        this.document = document;
    }

    /**
     * Sets the URL of the current document; findPaginationURLs() resolves the found links
     * against it. The current document and pagination XPath are left untouched.
     *
     * @param str the URL of the current document
     */
    public void setUrl(String str) {
        this.url = str;
    }

    /**
     * Demo: discovers the entity XPath and the pagination XPath of a Wikipedia list page and
     * prints the results to standard output.
     *
     * @param strArr command line arguments (ignored)
     */
    public static void main(String[] strArr) {
        final String listUrl = "http://en.wikipedia.org/wiki/List_of_countries_by_population";

        ListDiscoverer discoverer = new ListDiscoverer();
        String entityXPath = discoverer.discoverEntityXPath(listUrl);
        if (entityXPath.length() != 0) {
            System.out.println("path: " + entityXPath.toLowerCase());
            System.out.println("path: " + entityXPath);
        } else {
            System.out.println("no path found");
        }

        // Print the text and the individual entries selected by the entity XPath.
        Document page = new DocumentRetriever().getWebDocument(listUrl);
        System.out.println(PageAnalyzer.getTextByXPath(page, entityXPath));
        for (Node entityNode : XPathHelper.getXhtmlNodes(page, entityXPath)) {
            System.out.println("kbEntities.put(\"" + StringHelper.trim(entityNode.getTextContent()) + "\",ct1);");
        }

        // Print the text of the pagination links and the XPath that selects them.
        discoverer.findPaginationURLs(listUrl);
        String linkXPath = discoverer.getPaginationXPath().replaceAll("/@href", "");
        System.out.println(PageAnalyzer.getTextByXPath(page, linkXPath));
        System.out.println("pagination xpath: " + discoverer.getPaginationXPath().toLowerCase());
    }
}
