package ws.palladian.retrieval;

import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.lang3.Validate;
import org.apache.log4j.spi.LocationInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/PageAnalyzer.class */
public final class PageAnalyzer {
    /** Logger shared by the static helpers of this class. */
    public static final Logger LOGGER = LoggerFactory.getLogger(PageAnalyzer.class);

    /** Private constructor: this class only exposes static methods and is not meant to be instantiated. */
    private PageAnalyzer() {
    }

    /**
     * Returns the text content of the node that {@code XPathHelper.getXhtmlNode} resolves for
     * {@code //title}.
     *
     * @param document the document to look the title up in
     * @return the title node's text content, or the marker {@code "#error#"} when no node was found
     */
    public static String getTitle(Document document) {
        Node titleNode = XPathHelper.getXhtmlNode(document, "//title");
        if (titleNode == null) {
            return "#error#";
        }
        return titleNode.getTextContent();
    }

    /**
     * Picks the most frequent table cell XPath of a document.
     *
     * <p>The candidate is the XPath with the highest count in the set built by
     * {@link #getXPathSet(Document)} (queried with the argument {@code 4}). If the same path with its
     * last character replaced by {@code 'h'} occurs exactly as often, that variant is used instead.
     *
     * @param document the document to analyse
     * @return a three element array: {@code [0]} the chosen XPath, {@code [1]} always {@code "0"},
     *         {@code [2]} half the candidate's count (rounded up), or the full count when the
     *         {@code 'h'} variant was chosen
     */
    public static String[] detectFactTable(Document document) {
        XPathSet candidates = getXPathSet(document);
        String tablePath = candidates.getHighestCountXPath(4);
        String rowCount = String.valueOf((int) Math.ceil(candidates.getCountOfXPath(tablePath) / 2.0d));
        if (tablePath.length() > 0) {
            // same path, but with the trailing character swapped for 'h' (e.g. ".../td" -> ".../th")
            String headerVariant = tablePath.substring(0, tablePath.length() - 1) + "h";
            if (candidates.getCountOfXPath(headerVariant) == candidates.getCountOfXPath(tablePath)) {
                tablePath = headerVariant;
                rowCount = String.valueOf(candidates.getCountOfXPath(headerVariant));
            }
        }
        return new String[]{tablePath, "0", rowCount};
    }

    /**
     * Collects the XPaths of all {@code //td} and {@code //th} nodes of a document. Every index is
     * stripped from the paths except those on {@code table} steps.
     *
     * @param document the document to scan
     * @return the set of normalised cell XPaths
     */
    private static XPathSet getXPathSet(Document document) {
        XPathSet result = new XPathSet();
        String[] tagsKeepingIndex = {"table"};
        for (String cellQuery : new String[]{"//td", "//th"}) {
            List<Node> cells = XPathHelper.getXhtmlNodes(document, cellQuery);
            if (cells == null) {
                continue;
            }
            for (Node cell : cells) {
                result.add(removeXPathIndicesNot(constructXPath(cell), tagsKeepingIndex));
            }
        }
        return result;
    }

    /**
     * Convenience overload of {@link #constructAllXPaths(Document, String, boolean, boolean)} that
     * passes {@code false} for both boolean flags.
     *
     * @param document the document to search
     * @param str the text to look for
     * @return the XPaths produced by the four-argument overload
     */
    public static LinkedHashSet<String> constructAllXPaths(Document document, String str) {
        return constructAllXPaths(document, str, false, false);
    }

    /**
     * Builds the XPaths of all nodes below the document's last child that contain the given text.
     *
     * <p>Each path is passed through {@code XPathHelper.addXhtmlNsToXPath}; afterwards every path
     * that is a strict prefix of the longest path is dropped from the result.
     *
     * @param document the document to search; {@code null} yields an empty set and a warning
     * @param str the text to look for
     * @param z when {@code true}, positional indices are removed from the resulting paths
     * @param z2 forwarded to the node traversal, where it enables the whole-word check on node values
     * @return the matching XPaths in encounter order
     */
    public static LinkedHashSet<String> constructAllXPaths(Document document, String str, boolean z, boolean z2) {
        if (document == null) {
            LOGGER.warn("document was null when constructing xpaths");
            return new LinkedHashSet<>();
        }

        LinkedHashSet<String> rawPaths = new LinkedHashSet<>();
        try {
            rawPaths = visit(document.getLastChild(), str, z2, rawPaths);
        } catch (Exception | StackOverflowError failure) {
            LOGGER.error(document.getDocumentURI(), failure);
        }

        LinkedHashSet<String> result = new LinkedHashSet<>();
        for (String rawPath : rawPaths) {
            String path = XPathHelper.addXhtmlNsToXPath(document, rawPath);
            result.add(z ? removeXPathIndices(path) : path);
        }

        // first path of maximal length, in encounter order
        String longest = "";
        for (String path : result) {
            if (path.length() > longest.length()) {
                longest = path;
            }
        }

        // drop every path that is only a strict prefix of the longest one
        for (Iterator<String> paths = result.iterator(); paths.hasNext();) {
            String path = paths.next();
            if (longest.length() > path.length() && longest.startsWith(path)) {
                paths.remove();
            }
        }
        return result;
    }

    /**
     * Filters XPaths by the tag name of their last step.
     *
     * <p>The last step is compared after removing positional indices, lower-casing and stripping any
     * {@code xhtml:} prefix.
     *
     * @param linkedHashSet the XPaths to filter
     * @param strArr the accepted tag names (compared case-insensitively)
     * @return the XPaths whose last step names one of the accepted tags, in their original order
     */
    public static LinkedHashSet<String> keepXPathPointingTo(LinkedHashSet<String> linkedHashSet, String[] strArr) {
        Set<String> acceptedTags = new HashSet<>();
        for (String tag : strArr) {
            acceptedTags.add(tag.toLowerCase());
        }

        LinkedHashSet<String> kept = new LinkedHashSet<>();
        for (String xPath : linkedHashSet) {
            String[] steps = removeXPathIndices(xPath).split(CoreLabel.TAG_SEPARATOR);
            String targetTag = steps[steps.length - 1].toLowerCase().replaceAll("xhtml:", "");
            if (acceptedTags.contains(targetTag)) {
                kept.add(xPath);
            }
        }
        return kept;
    }

    /**
     * Merges a set of XPaths into one "mutual" XPath.
     *
     * <p>The first path whose index-free steps agree with the most frequent index-free path serves as
     * the template. A positional index is removed from a template step whenever another path has the
     * same tag at that depth with a different index; all other steps are kept unchanged.
     *
     * @param set the XPaths to merge
     * @return the merged XPath, or an empty string for an empty set
     * @throws NumberFormatException if a bracketed predicate compared here is not a plain number
     */
    public static String makeMutualXPath(Set<String> set) {
        if (set.isEmpty()) {
            return "";
        }

        XPathSet indexFreePaths = new XPathSet();
        for (String xPath : set) {
            indexFreePaths.add(removeXPathIndices(xPath));
        }
        String[] dominantSteps = indexFreePaths.getHighestCountXPath().split(CoreLabel.TAG_SEPARATOR);

        // template = first path agreeing with the dominant path on their common leading steps
        String template = "";
        for (String candidate : set) {
            String[] candidateSteps = removeXPathIndices(candidate).split(CoreLabel.TAG_SEPARATOR);
            int shared = Math.min(candidateSteps.length, dominantSteps.length);
            boolean agrees = true;
            for (int depth = 0; depth < shared && agrees; depth++) {
                agrees = candidateSteps[depth].equals(dominantSteps[depth]);
            }
            if (agrees) {
                template = candidate;
                break;
            }
        }

        String[] templateSteps = template.split(CoreLabel.TAG_SEPARATOR);
        boolean[] indexIsCommon = new boolean[templateSteps.length];
        for (int depth = 0; depth < indexIsCommon.length; depth++) {
            indexIsCommon[depth] = true;
        }

        for (String xPath : set) {
            String[] otherSteps = xPath.split(CoreLabel.TAG_SEPARATOR);
            int shared = Math.min(templateSteps.length, otherSteps.length);
            for (int depth = 0; depth < shared; depth++) {
                String templateStep = templateSteps[depth];
                String otherStep = otherSteps[depth];
                int templateBracket = templateStep.indexOf("[");
                if (templateBracket == -1) {
                    continue;
                }
                int otherBracket = otherStep.indexOf("[");
                if (otherBracket == -1) {
                    continue;
                }
                // both indices are parsed before the tag comparison, as the contents must be numeric
                int templateIndex = Integer.parseInt(templateStep.substring(templateBracket + 1, templateStep.length() - 1));
                int otherIndex = Integer.parseInt(otherStep.substring(otherBracket + 1, otherStep.length() - 1));
                boolean sameTag = templateStep.substring(0, templateBracket).equals(otherStep.substring(0, otherBracket));
                if (sameTag && templateIndex != otherIndex) {
                    indexIsCommon[depth] = false;
                }
            }
        }

        StringBuilder merged = new StringBuilder();
        for (int depth = 0; depth < templateSteps.length; depth++) {
            String step = templateSteps[depth];
            int bracket = step.indexOf("[");
            if (bracket != -1 && !indexIsCommon[depth]) {
                step = step.substring(0, bracket);
            }
            merged.append(step).append(CoreLabel.TAG_SEPARATOR);
        }
        // cut the trailing separator
        return merged.substring(0, merged.length() - 1);
    }

    /**
     * Recursively walks the children of a node and records the XPath of every child that contains
     * the search text.
     *
     * <p>A child is a candidate when its text content contains the text, or when it is not a comment
     * and its node value contains the text ignoring case. With {@code z} set and a non-null node
     * value, the text must additionally occur in the value without an adjacent letter or underscore.
     * Exceptions raised during the walk are logged and end the walk of the current level.
     *
     * @param node the node whose children are inspected
     * @param str the text to look for
     * @param z whether to apply the stricter whole-word check on node values
     * @param linkedHashSet the accumulator for the XPaths found
     * @return the accumulator
     */
    private static LinkedHashSet<String> visit(Node node, String str, boolean z, LinkedHashSet<String> linkedHashSet) {
        try {
            Node child = node.getFirstChild();
            while (child != null) {
                String value = child.getNodeValue();
                boolean containsText = child.getTextContent().contains(str)
                        || (value != null && child.getNodeType() != Node.COMMENT_NODE
                                && value.toLowerCase().contains(str.toLowerCase()));
                if (containsText) {
                    boolean accepted = !z || value == null
                            || Pattern.compile("(?<![A-Za-z_])" + Pattern.quote(str) + "(?![A-Za-z_])", Pattern.CASE_INSENSITIVE)
                                    .matcher(value).find();
                    if (accepted) {
                        String xPath = constructXPath(child);
                        if (xPath.length() > 0) {
                            linkedHashSet.add(xPath);
                        }
                    }
                }
                linkedHashSet = visit(child, str, z, linkedHashSet);
                child = child.getNextSibling();
            }
        } catch (Exception e) {
            LOGGER.error(e.getMessage());
        }
        return linkedHashSet;
    }

    /**
     * Builds an XPath string for a node by walking from the node up to the root.
     *
     * <p>A step receives a positional index (its 1-based position among equally named preceding
     * siblings) when it has a following sibling or is not the first of its name; {@code html} and
     * {@code th} steps never get an index. Text node steps are cut off. The first nine characters of
     * the assembled path are dropped (presumably the {@code #document} root step - confirm for
     * detached nodes).
     *
     * @param node the node to describe; must not be {@code null}
     * @return the XPath without a trailing separator, or an empty string if the path runs through a
     *         {@code script} element
     */
    public static String constructXPath(Node node) {
        String path = "";
        Node current = node;
        do {
            String name = current.getNodeName();

            // 1-based position among the preceding siblings carrying the same name
            int position = 1;
            for (Node sibling = current.getPreviousSibling(); sibling != null; sibling = sibling.getPreviousSibling()) {
                if (sibling.getNodeName().equalsIgnoreCase(name)) {
                    position++;
                }
            }

            boolean needsIndex = (current.getNextSibling() != null || position > 1)
                    && !name.equalsIgnoreCase("html")
                    && !name.equalsIgnoreCase(XHtmlWriter.TH);
            String step = needsIndex ? name + "[" + position + DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END : name;

            path = step + CoreLabel.TAG_SEPARATOR + path;
            int textStep = path.indexOf("/#text");
            if (textStep > -1) {
                path = path.substring(0, textStep);
            }
            current = current.getParentNode();
        } while (current != null);

        String xPath = path.substring(9);
        String lowered = xPath.toLowerCase();
        if (lowered.contains("/script") || lowered.contains("/html:script")) {
            return "";
        }
        return xPath.endsWith(CoreLabel.TAG_SEPARATOR) ? xPath.substring(0, xPath.length() - 1) : xPath;
    }

    /**
     * Builds a path for a node in which every step is the node name followed by the id/class suffix
     * from {@link #createIdClassString(Node)}.
     *
     * <p>The path is assembled leaf-first in reversed form and reversed once at the end, so the
     * result lists the steps from the topmost ancestor (below {@code #document}) down to the node.
     *
     * @param node the node to describe; must not be {@code null}
     * @return the assembled path
     */
    public static String constructIdClassXPath(Node node) {
        Validate.notNull(node, "node must not be null", new Object[0]);
        StringBuilder reversedPath = new StringBuilder();
        for (Node current = node; current != null; current = current.getParentNode()) {
            String name = current.getNodeName();
            if (name.equals("#document")) {
                break;
            }
            String step = name + createIdClassString(current);
            reversedPath.append(StringHelper.reverseString(step)).append('/');
        }
        return StringHelper.reverseString(reversedPath.toString());
    }

    /**
     * Renders the id and class attributes of a node as a selector-like suffix: each
     * whitespace-separated id token is prefixed with {@code '#'}, each class token with {@code '.'}.
     *
     * @param node the node to inspect; must not be {@code null}
     * @return the suffix, or an empty string when the node has no attributes or neither attribute
     */
    public static String createIdClassString(Node node) {
        Validate.notNull(node, "node must not be null", new Object[0]);
        if (node.getAttributes() == null) {
            return "";
        }
        StringBuilder suffix = new StringBuilder();
        appendPrefixedTokens(suffix, '#', node.getAttributes().getNamedItem(XHtmlWriter.ID));
        appendPrefixedTokens(suffix, '.', node.getAttributes().getNamedItem(XHtmlWriter.CLASS));
        return suffix.toString();
    }

    /** Appends every non-empty whitespace-separated token of the attribute value, each preceded by the prefix. */
    private static void appendPrefixedTokens(StringBuilder target, char prefix, Node attribute) {
        if (attribute == null) {
            return;
        }
        for (String token : attribute.getNodeValue().trim().split("\\s+")) {
            if (!token.isEmpty()) {
                target.append(prefix).append(token);
            }
        }
    }

    /**
     * Checks whether one of the trailing steps of an XPath starts with a table cell tag
     * ({@code td}/{@code th}, optionally with an {@code xhtml:} prefix).
     *
     * @param str the XPath to inspect
     * @param i the number of trailing steps to look at (the first step is never inspected)
     * @return {@code true} if such a step was found
     */
    public static boolean nodeInTable(String str, int i) {
        String[] steps = str.split(CoreLabel.TAG_SEPARATOR);
        int lowerBound = Math.max(0, (steps.length - i) - 1);
        for (int pos = steps.length - 1; pos > lowerBound; pos--) {
            String step = steps[pos].toLowerCase();
            if (step.startsWith(XHtmlWriter.TD) || step.startsWith("xhtml:td")
                    || step.startsWith(XHtmlWriter.TH) || step.startsWith("xhtml:th")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Truncates an XPath after its last table cell step ({@code td}/{@code th}, optionally prefixed
     * with {@code xhtml:}).
     *
     * <p>The path is rebuilt from its steps, skipping the first split element and prefixing every
     * remaining step with the separator. Without a cell step the complete path is rebuilt.
     *
     * @param str the XPath to truncate
     * @return the path up to and including the last cell step
     */
    public static String getTableCellPath(String str) {
        String[] steps = str.split(CoreLabel.TAG_SEPARATOR);
        int end = steps.length;
        int pos = steps.length - 1;
        while (pos > 0) {
            String step = steps[pos].toLowerCase();
            boolean isCell = step.startsWith(XHtmlWriter.TD) || step.startsWith("xhtml:td")
                    || step.startsWith(XHtmlWriter.TH) || step.startsWith("xhtml:th");
            if (isCell) {
                end = pos + 1;
                break;
            }
            pos--;
        }

        StringBuilder path = new StringBuilder();
        for (int index = 1; index < end; index++) {
            path.append(CoreLabel.TAG_SEPARATOR).append(steps[index]);
        }
        return path.toString();
    }

    /**
     * Returns the tag name of the last step of an XPath: lower-cased, without any {@code xhtml:}
     * prefix and without positional index.
     *
     * @param str the XPath to inspect
     * @return the bare tag name, or an empty string if the path contains no separator
     */
    public static String getTargetNode(String str) {
        int lastSeparator = str.lastIndexOf(CoreLabel.TAG_SEPARATOR);
        if (lastSeparator == -1) {
            return "";
        }
        String lastStep = str.substring(lastSeparator + 1).toLowerCase();
        return lastStep.replace("xhtml:", "").replaceAll("\\[(\\d)+\\]", "");
    }

    /**
     * Checks whether one of the trailing steps of an XPath starts with {@code p} or {@code div}
     * (optionally with an {@code xhtml:} prefix). Note that this is a prefix test, so any tag name
     * beginning with {@code p} matches as well.
     *
     * @param str the XPath to inspect
     * @param i the number of trailing steps to look at (the first step is never inspected)
     * @return {@code true} if such a step was found
     */
    public static boolean nodeInBox(String str, int i) {
        String[] steps = str.split(CoreLabel.TAG_SEPARATOR);
        int lowerBound = Math.max(0, (steps.length - i) - 1);
        for (int pos = steps.length - 1; pos > lowerBound; pos--) {
            String step = steps[pos].toLowerCase();
            if (step.startsWith("p") || step.startsWith("xhtml:p")
                    || step.startsWith(XHtmlWriter.DIV) || step.startsWith("xhtml:div")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Truncates an XPath after its last "box" step, i.e. the last step starting with {@code p},
     * {@code div}, {@code td} or {@code th} (each optionally prefixed with {@code xhtml:}).
     *
     * <p>The path is rebuilt from its steps, skipping the first split element and prefixing every
     * remaining step with the separator. Without a box step the complete path is rebuilt.
     *
     * @param str the XPath to truncate
     * @return the path up to and including the last box step
     */
    public static String findLastBoxSection(String str) {
        String[] steps = str.split(CoreLabel.TAG_SEPARATOR);
        int end = steps.length;
        int pos = steps.length - 1;
        while (pos > 0) {
            String step = steps[pos].toLowerCase();
            boolean paragraphOrDiv = step.startsWith("p") || step.startsWith("xhtml:p")
                    || step.startsWith(XHtmlWriter.DIV) || step.startsWith("xhtml:div");
            boolean tableCell = step.startsWith(XHtmlWriter.TD) || step.startsWith("xhtml:td")
                    || step.startsWith(XHtmlWriter.TH) || step.startsWith("xhtml:th");
            if (paragraphOrDiv || tableCell) {
                end = pos + 1;
                break;
            }
            pos--;
        }

        StringBuilder path = new StringBuilder();
        for (int index = 1; index < end; index++) {
            path.append(CoreLabel.TAG_SEPARATOR).append(steps[index]);
        }
        return path.toString();
    }

    /**
     * Convenience overload of {@link #getNextSibling(String, boolean)} that passes {@code false},
     * i.e. the last bracketed index anywhere in the path is the one considered.
     *
     * @param str the XPath to advance
     * @return the XPath produced by the two-argument overload
     */
    public static String getNextSibling(String str) {
        return getNextSibling(str, false);
    }

    /**
     * Rewrites an XPath so that it points to the next sibling.
     *
     * <p>If the path has a {@code td} step (or, failing that, a {@code th} step) located after the
     * index under consideration, the tail starting at that step is rewritten so that cell steps
     * carry the index {@code [1]} ({@code th} becomes {@code td[1]}). Otherwise the index under
     * consideration is incremented by one.
     *
     * @param str the XPath to advance
     * @param z {@code true} to consider the index of the last {@code td[}/{@code TD[} step,
     *          {@code false} to consider the last bracketed index of the whole path
     * @return the rewritten XPath, or {@code str} unchanged when no usable index was found
     */
    public static String getNextSibling(String str, boolean z) {
        int open;
        int close;
        if (z) {
            // +2 moves from the 't' of "td[" onto the '['; a miss (-1) therefore ends up as 1
            open = Math.max(str.lastIndexOf("td["), str.lastIndexOf("TD[")) + 2;
            close = str.indexOf(DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END, open);
        } else {
            open = str.lastIndexOf("[");
            close = str.lastIndexOf(DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END);
        }

        String lowered = str.toLowerCase();
        int lastTd = Math.max(lowered.lastIndexOf("/td"), lowered.lastIndexOf("/xhtml:td"));
        int lastTh = Math.max(lowered.lastIndexOf("/th"), lowered.lastIndexOf("/xhtml:th"));

        if (lastTd > close && lastTd > lastTh) {
            String tail = str.substring(lastTd)
                    .replace("/td", "/td[1]")
                    .replace("/TD", "/TD[1]")
                    .replace("/xhtml:td", "/xhtml:td[1]")
                    .replace("/xhtml:TD", "/xhtml:TD[1]");
            return str.substring(0, lastTd) + tail;
        }
        if (lastTh > close && lastTh > lastTd) {
            String tail = str.substring(lastTh)
                    .replace("/th", "/td[1]")
                    .replace("/TH", "/TD[1]")
                    .replace("/xhtml:th", "/xhtml:td[1]")
                    .replace("/xhtml:TH", "/xhtml:TD[1]");
            return str.substring(0, lastTh) + tail;
        }

        boolean hasIndex = close > open && open != 1;
        if (!hasIndex) {
            return str;
        }
        int nextIndex = Integer.parseInt(str.substring(open + 1, close)) + 1;
        return str.substring(0, open + 1) + nextIndex + str.substring(close);
    }

    /**
     * Returns the XPath of the next table cell by delegating to
     * {@link #getNextSibling(String, boolean)} with the flag set to {@code true}.
     *
     * @param str the XPath of the current cell
     * @return the XPath produced by {@code getNextSibling(str, true)}
     */
    public static String getNextTableCell(String str) {
        return getNextSibling(str, true);
    }

    /**
     * Adds the index {@code [1]} to the {@code td} steps in the tail of an XPath, provided the last
     * {@code td} step lies behind both the last indexed {@code td[...]} step and the last
     * {@code th} step.
     *
     * @param str the XPath to rewrite
     * @return the rewritten XPath, or {@code str} unchanged when the condition does not hold
     */
    public static String getFirstTableCell(String str) {
        int lastIndexedTd = Math.max(str.lastIndexOf("td["), str.lastIndexOf("TD[")) + 2;
        int close = str.indexOf(DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END, lastIndexedTd);

        String lowered = str.toLowerCase();
        int lastTd = Math.max(lowered.lastIndexOf("/td"), lowered.lastIndexOf("/xhtml:td"));
        int lastTh = Math.max(lowered.lastIndexOf("/th"), lowered.lastIndexOf("/xhtml:th"));

        boolean trailingTd = lastTd > close && lastTd > lastTh;
        if (!trailingTd) {
            return str;
        }
        String tail = str.substring(lastTd)
                .replace("/td", "/td[1]")
                .replace("/TD", "/TD[1]")
                .replace("/xhtml:td", "/xhtml:td[1]")
                .replace("/xhtml:TD", "/xhtml:TD[1]");
        return str.substring(0, lastTd) + tail;
    }

    /**
     * Counts the table rows found for a cell XPath: the size of the list returned by
     * {@link #getTableRows(Document, String, String)}, called with the XPath and its next table cell.
     *
     * @param document the document containing the table
     * @param str the XPath of a table cell
     * @return the number of row entries found
     */
    public static int getNumberOfTableRows(Document document, String str) {
        return getTableRows(document, str, getNextSibling(str, true)).size();
    }

    /**
     * Convenience overload of {@link #getTableRows(Document, String, String)} that uses
     * {@code getNextSibling(str, true)} as the second XPath.
     *
     * @param document the document containing the table
     * @param str the XPath of a table cell
     * @return the row XPath pairs produced by the three-argument overload
     */
    public static List<String[]> getTableRows(Document document, String str) {
        return getTableRows(document, str, getNextSibling(str, true));
    }

    /**
     * Produces, for every row of a table, the pair of XPaths obtained by putting the row number into
     * the last {@code tr[...]} step of the two given XPaths.
     *
     * <p>If the first XPath has no indexed {@code tr} step, both XPaths are first passed through
     * {@link #getNextTableRow(String)}. The number of rows is the number of {@code tr} children of
     * the first node matching the parent of that {@code tr} step. The bracket positions found in
     * {@code str} are applied to {@code str2} as well.
     *
     * @param document the document containing the table
     * @param str the XPath of the first cell
     * @param str2 the XPath of the second cell
     * @return one two-element array per row, or an empty list if no indexed row step or no parent
     *         node could be found
     */
    public static List<String[]> getTableRows(Document document, String str, String str2) {
        List<String[]> rows = new ArrayList<>();

        // +2 moves from the 't' of "tr[" onto the '['; a miss (-1) therefore ends up as 1
        int open = Math.max(str.lastIndexOf("tr["), str.lastIndexOf("TR[")) + 2;
        int close = str.indexOf(DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END, open);
        if (close <= open || open == 1) {
            str = getNextTableRow(str);
            str2 = getNextTableRow(str2);
            open = Math.max(str.lastIndexOf("tr["), str.lastIndexOf("TR[")) + 2;
            close = str.indexOf(DefaultExpressionEngine.DEFAULT_ATTRIBUTE_END, open);
        }
        if (close <= open || open == 1) {
            return rows;
        }

        List<Node> rowParents = XPathHelper.getXhtmlNodes(document, getParentNode(str.substring(0, open)));
        if (rowParents.isEmpty()) {
            return rows;
        }

        NodeList children = rowParents.get(0).getChildNodes();
        int rowCount = 0;
        for (int index = 0; index < children.getLength(); index++) {
            String childName = children.item(index).getNodeName().toLowerCase();
            if (childName.equals(XHtmlWriter.TR)) {
                rowCount++;
            }
        }

        for (int row = 1; row <= rowCount; row++) {
            String first = str.substring(0, open + 1) + row + str.substring(close);
            String second = str2.substring(0, open + 1) + row + str2.substring(close);
            rows.add(new String[]{first, second});
        }
        return rows;
    }

    /**
     * Advances the last table row step of an XPath: an unindexed {@code tr} step becomes
     * {@code tr[1]}, an indexed {@code tr[n]} step becomes {@code tr[n+1]}.
     *
     * <p>Only a complete step named {@code tr} (any case, optionally namespace-prefixed such as
     * {@code xhtml:tr}) is considered. The previous implementation matched the last occurrence of
     * the characters "tr" anywhere, so it corrupted paths containing tags like {@code strong} and
     * threw a {@link StringIndexOutOfBoundsException} when the path ended in {@code tr}.
     *
     * @param str the XPath to advance
     * @return the advanced XPath, or {@code str} unchanged if it has no {@code tr} step or the index
     *         bracket of that step is never closed
     * @throws NumberFormatException if the predicate of the {@code tr} step is not a plain number
     */
    public static String getNextTableRow(String str) {
        // locate the last step whose tag name is exactly "tr"
        int tagStart = -1;
        for (int pos = str.length() - 2; pos >= 0; pos--) {
            if (!str.regionMatches(true, pos, "tr", 0, 2)) {
                continue;
            }
            boolean startsStep = pos == 0 || str.charAt(pos - 1) == '/' || str.charAt(pos - 1) == ':';
            int next = pos + 2;
            boolean endsStep = next == str.length() || str.charAt(next) == '[' || str.charAt(next) == '/';
            if (startsStep && endsStep) {
                tagStart = pos;
                break;
            }
        }
        if (tagStart == -1) {
            return str;
        }

        int afterTag = tagStart + 2;
        if (afterTag == str.length() || str.charAt(afterTag) != '[') {
            // no index yet: start with the first row
            return str.substring(0, afterTag) + "[1]" + str.substring(afterTag);
        }

        int close = str.indexOf(']', afterTag);
        if (close == -1) {
            // malformed predicate, leave the path alone
            return str;
        }
        int nextRow = Integer.parseInt(str.substring(afterTag + 1, close)) + 1;
        return str.substring(0, afterTag + 1) + nextRow + str.substring(close);
    }

    /**
     * Returns the XPath of the parent step, i.e. everything before the last {@code '/'}.
     *
     * <p>A path without any separator has no parent and yields an empty string; previously this case
     * failed with a {@link StringIndexOutOfBoundsException}.
     *
     * @param str the XPath whose last step is to be cut off
     * @return the parent XPath, or an empty string if {@code str} contains no separator
     */
    public static String getParentNode(String str) {
        // '/' is the step separator used for all XPath strings handled by this class
        int lastSeparator = str.lastIndexOf('/');
        if (lastSeparator == -1) {
            return "";
        }
        return str.substring(0, lastSeparator);
    }

    /**
     * Estimates the number of columns of the table an XPath points into.
     *
     * <p>For every node matching the parent of the table cell path, the {@code td}/{@code th}
     * children are counted, with a {@code colspan} attribute contributing its numeric value instead
     * of one. The width that heads the frequency map after {@code CollectionHelper.sortByValue(...,
     * false)} is returned (presumably the most frequent width - confirm the sort direction).
     *
     * @param document the document containing the table
     * @param str an XPath pointing into a table cell
     * @return the chosen column count (at least 1), or 0 if no row node was found
     * @throws NumberFormatException if a {@code colspan} value is not a plain number
     */
    public static int getNumberOfTableColumns(Document document, String str) {
        List<Node> rows = XPathHelper.getXhtmlNodes(document, getParentNode(getTableCellPath(str)));

        LinkedHashMap<Integer, Integer> widthFrequency = new LinkedHashMap<>();
        for (Node row : rows) {
            int width = 0;
            for (Node cell : XPathHelper.getXhtmlNodes(row, "./*[(self::xhtml:td) or (self::xhtml:th)]")) {
                NamedNodeMap attributes = cell.getAttributes();
                for (int index = 0; index < attributes.getLength(); index++) {
                    Node attribute = attributes.item(index);
                    if (attribute.getNodeName().equalsIgnoreCase(XHtmlWriter.COLSPAN)) {
                        // the cell itself is counted below, so only the extra columns are added here
                        width += Integer.parseInt(attribute.getNodeValue()) - 1;
                        break;
                    }
                }
                width++;
            }
            Integer seen = widthFrequency.get(width);
            widthFrequency.put(width, seen == null ? 1 : seen + 1);
        }

        if (widthFrequency.isEmpty()) {
            return 0;
        }
        Map.Entry<?, ?> first = (Map.Entry<?, ?>) CollectionHelper.sortByValue(widthFrequency, false).entrySet().iterator().next();
        int columns = ((Integer) first.getKey()).intValue();
        return columns == 0 ? 1 : columns;
    }

    /**
     * Collects the text of all nodes matching an XPath, using
     * {@link #getSeparatedTextContents(Node, StringBuilder)} per node and appending a single space
     * after each node's text.
     *
     * @param document the document to query; {@code null} yields an empty string and a warning
     * @param str the XPath to evaluate; an empty string yields an empty string and a warning
     * @return the collected text, or the marker {@code "#error#"} if evaluation failed
     */
    public static String getTextByXPath(Document document, String str) {
        if (document == null || str.length() == 0) {
            LOGGER.warn("document is NULL or xpath is empty");
            return "";
        }
        try {
            StringBuilder text = new StringBuilder();
            for (Node match : XPathHelper.getXhtmlNodes(document, str)) {
                StringBuilder nodeText = getSeparatedTextContents(match, new StringBuilder(""));
                text.append(nodeText).append(Strings.SINGLE_SPACE_STRING);
            }
            return text.toString();
        } catch (Exception | OutOfMemoryError failure) {
            LOGGER.error(str + Strings.SINGLE_SPACE_STRING + failure.getMessage());
            return "#error#";
        }
    }

    /**
     * Recursively appends the text of a node's descendants to a builder.
     *
     * <p>Text node values are passed through {@code StringHelper.trim} with the characters
     * {@code -:.?!'"} and, if non-empty, appended followed by a space. A line break is appended for
     * every {@code br} child and after every {@code div}. At most the first 50 children of each node
     * are visited.
     *
     * @param node the node whose content is collected
     * @param sb the builder to append to
     * @return the builder holding the collected text
     */
    private static StringBuilder getSeparatedTextContents(Node node, StringBuilder sb) throws OutOfMemoryError {
        int visited = 0;
        Node child = node.getFirstChild();
        while (child != null && visited < 50) {
            String value = child.getNodeValue();
            if (value != null && child.getNodeType() == Node.TEXT_NODE) {
                String trimmed = StringHelper.trim(value, "-:.?!'\"");
                if (trimmed.length() > 0) {
                    sb.append(trimmed).append(Strings.SINGLE_SPACE_STRING);
                }
            }
            if (child.getNodeName().equalsIgnoreCase(XHtmlWriter.BR)) {
                sb.append(FileHelper.NEWLINE_CHARACTER);
            }
            sb = getSeparatedTextContents(child, sb);
            child = child.getNextSibling();
            visited++;
        }
        if (node.getNodeName().equalsIgnoreCase(XHtmlWriter.DIV)) {
            sb.append(FileHelper.NEWLINE_CHARACTER);
        }
        return sb;
    }

    /**
     * Returns the text content of every node matching an XPath.
     *
     * @param document the document to query; {@code null} yields an empty list
     * @param str the XPath to evaluate
     * @return the text contents in the order the nodes were returned
     */
    public static List<String> getTextsByXPath(Document document, String str) {
        List<String> texts = new ArrayList<>();
        if (document != null) {
            for (Node match : XPathHelper.getXhtmlNodes(document, str)) {
                texts.add(match.getTextContent());
            }
        }
        return texts;
    }

    /**
     * Finds the link on a page whose URL is most similar to the page's own URL.
     *
     * <p>Every {@code href} value is resolved against the page URL (anchors removed, parameters
     * decoded) and scored with {@code StringHelper.calculateSimilarity}. Links equal to the page URL
     * (ignoring case) are skipped. The entry that heads the map after
     * {@code CollectionHelper.sortByValue(..., false)} is returned with spaces replaced by
     * {@code %20}.
     *
     * <p>Compared with the previous version, computations whose results were never used have been
     * removed (domain lookup, file extension checks, a discarded {@code URLEncoder.encode} call), and
     * links that cannot be decoded are skipped instead of failing with a
     * {@link NullPointerException}. The result for well-formed input is unchanged.
     *
     * @param document the page to analyse; must not be {@code null}
     * @return the most similar linked URL, or an empty string if the page URL is unavailable, is a
     *         {@code file:} URL, or no candidate link was found
     */
    public static String getSiblingPage(Document document) {
        String pageUrl = UrlHelper.decodeParameter(document.getDocumentURI());
        if (pageUrl == null || pageUrl.startsWith("file:")) {
            return "";
        }
        pageUrl = UrlHelper.removeAnchors(pageUrl);

        List<Node> hrefs = XPathHelper.getNodes(document, "//@href");
        if (hrefs == null) {
            return "";
        }

        LinkedHashMap<String, Double> similarityByUrl = new LinkedHashMap<>();
        for (Node href : hrefs) {
            String fullUrl = UrlHelper.makeFullUrl(pageUrl, UrlHelper.removeAnchors(href.getTextContent().trim()));
            if (fullUrl.length() == 0) {
                continue;
            }
            String candidate = UrlHelper.decodeParameter(fullUrl);
            if (candidate == null || pageUrl.equalsIgnoreCase(candidate)) {
                // undecodable link, or a link back to the page itself
                continue;
            }
            double similarity = StringHelper.calculateSimilarity(candidate, pageUrl, false);
            similarityByUrl.put(candidate, Double.valueOf(similarity));
        }

        String sibling = "";
        Map<String, Double> ranked = CollectionHelper.sortByValue(similarityByUrl, false);
        if (!ranked.isEmpty()) {
            sibling = ranked.keySet().iterator().next().replace(Strings.SINGLE_SPACE_STRING, "%20");
        }
        LOGGER.info("sibling url: " + sibling);
        return sibling;
    }

    /**
     * Returns the text content of the first node matching {@code //title}.
     *
     * @param document the document to query
     * @return the title text, or an empty string if there is no title node
     */
    public static String extractTitle(Document document) {
        Iterator<Node> titles = XPathHelper.getXhtmlNodes(document, "//title").iterator();
        if (!titles.hasNext()) {
            return "";
        }
        return titles.next().getTextContent();
    }

    /**
     * Returns the text content of the first node matching {@code //body}.
     *
     * @param document the document to query
     * @return the body text, or an empty string if there is no body node or the lookup failed (the
     *         failure message is logged)
     */
    public static String extractBodyContent(Document document) {
        try {
            Iterator<Node> bodies = XPathHelper.getNodes(document, "//body").iterator();
            if (bodies.hasNext()) {
                return bodies.next().getTextContent();
            }
        } catch (Exception | OutOfMemoryError failure) {
            LOGGER.error(failure.getMessage());
        }
        return "";
    }

    /**
     * Extracts the words of all {@code meta} elements whose {@code name} attribute equals
     * {@code description} (ignoring case).
     *
     * @param document the document to query
     * @return the {@code content} values split at single whitespace characters, each part trimmed
     */
    public static List<String> extractDescription(Document document) {
        List<String> words = new ArrayList<>();
        for (Node meta : XPathHelper.getNodes(document, "//meta")) {
            NamedNodeMap attributes = meta.getAttributes();
            Node name = attributes.getNamedItem(XHtmlWriter.NAME);
            Node content = attributes.getNamedItem(XHtmlWriter.CONTENT);
            if (name == null || content == null || !name.getTextContent().equalsIgnoreCase("description")) {
                continue;
            }
            for (String word : content.getTextContent().split("\\s")) {
                words.add(word.trim());
            }
        }
        return words;
    }

    /**
     * Collects the {@code name}/{@code content} pairs of all {@code meta} elements that carry both
     * attributes.
     *
     * @param document the document to query
     * @return a map from lower-cased {@code name} to {@code content}; later elements overwrite
     *         earlier ones with the same name
     */
    public static Map<String, String> extractMetaInformation(Document document) {
        Map<String, String> metaInformation = new HashMap<>();
        for (Node meta : XPathHelper.getXhtmlNodes(document, "//meta")) {
            NamedNodeMap attributes = meta.getAttributes();
            Node name = attributes.getNamedItem(XHtmlWriter.NAME);
            Node content = attributes.getNamedItem(XHtmlWriter.CONTENT);
            if (name == null || content == null) {
                continue;
            }
            metaInformation.put(name.getTextContent().toLowerCase(), content.getTextContent());
        }
        return metaInformation;
    }

    /**
     * Extracts the keywords of all {@code meta} elements whose {@code name} attribute equals
     * {@code keywords} (ignoring case).
     *
     * @param document the document to query
     * @return the {@code content} values split at commas, each part trimmed
     */
    public static List<String> extractKeywords(Document document) {
        List<String> keywords = new ArrayList<>();
        for (Node meta : XPathHelper.getXhtmlNodes(document, "//meta")) {
            NamedNodeMap attributes = meta.getAttributes();
            Node name = attributes.getNamedItem(XHtmlWriter.NAME);
            Node content = attributes.getNamedItem(XHtmlWriter.CONTENT);
            if (name == null || content == null || !name.getTextContent().equalsIgnoreCase("keywords")) {
                continue;
            }
            for (String keyword : content.getTextContent().split(",")) {
                keywords.add(keyword.trim());
            }
        }
        return keywords;
    }

    /**
     * Removes every numeric positional index such as {@code [3]} from an XPath.
     *
     * @param str the XPath to strip
     * @return the XPath without numeric indices
     */
    public static String removeXPathIndices(String str) {
        final String numericIndex = "\\[(\\d)+\\]";
        return str.replaceAll(numericIndex, "");
    }

    /**
     * Removes the last numeric positional index of an XPath. The path is reversed so that the first
     * match of the mirrored index pattern is the last index of the original path, then reversed back.
     *
     * @param str the XPath to modify
     * @return the XPath without its last numeric index
     */
    public static String removeXPathIndicesFromLastCountNode(String str) {
        String reversed = StringHelper.reverseString(str);
        String withoutLastIndex = reversed.replaceFirst("\\](\\d)+\\[", "");
        return StringHelper.reverseString(withoutLastIndex);
    }

    /**
     * Removes the numeric positional index from every occurrence of the given tag names in an XPath.
     * The tag names are used as regular expression fragments.
     *
     * @param str the XPath to strip
     * @param strArr the tag names whose indices are removed
     * @return the XPath with the indices of the given tags removed
     */
    public static String removeXPathIndices(String str, String[] strArr) {
        String result = str;
        for (int index = 0; index < strArr.length; index++) {
            String tag = strArr[index];
            result = result.replaceAll(tag + "\\[(\\d)+\\]", tag);
        }
        return result;
    }

    /**
     * Removes every numeric positional index from an XPath except those on the given tag names.
     *
     * <p>The indices to keep are temporarily rewritten from {@code tag[n]} to {@code tag{n}}, all
     * remaining bracketed indices are removed, and the protected ones are restored. The capturing
     * group spans the whole number: the earlier pattern {@code (\d)+} captured only the last digit,
     * so {@code table[12]} came back as {@code table[2]}.
     *
     * @param str the XPath to strip
     * @param strArr the tag names whose indices are preserved (used as regular expression fragments)
     * @return the XPath with only the indices of the given tags left
     */
    public static String removeXPathIndicesNot(String str, String[] strArr) {
        String result = str;
        // protect the indices to keep: tag[n] -> tag{n}
        for (String tag : strArr) {
            result = result.replaceAll(tag + "\\[(\\d+)\\]", tag + "\\{$1\\}");
        }
        result = result.replaceAll("\\[(\\d)+\\]", "");
        // restore the protected indices: tag{n} -> tag[n]
        for (String tag : strArr) {
            result = result.replaceAll(tag + "\\{(\\d+)\\}", tag + "\\[$1\\]");
        }
        return result;
    }
}
