package ws.palladian.helper.html;

import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import java.io.File;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMResult;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.io.FileHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/helper/html/HtmlHelper.class */
public final class HtmlHelper {
    /** Logger shared by all static helpers of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlHelper.class);
    /**
     * Element names treated as block-level: {@code documentToReadableText} and {@code documentToText} emit a line
     * break around them. Lookups lower-case the node name first, so the entries are presumably lower-case tag names
     * (the {@code XHtmlWriter} constant values are not visible here).
     */
    private static final List<String> BLOCK_ELEMENTS = Arrays.asList(XHtmlWriter.ADDRESS, XHtmlWriter.BLOCKQUOTE, XHtmlWriter.DIV, XHtmlWriter.DL, XHtmlWriter.FIELDSET, XHtmlWriter.FORM, XHtmlWriter.H1, XHtmlWriter.H2, XHtmlWriter.H3, XHtmlWriter.H4, XHtmlWriter.H5, XHtmlWriter.H6, XHtmlWriter.HR, XHtmlWriter.NOSCRIPT, XHtmlWriter.OL, "p", XHtmlWriter.PRE, "table", XHtmlWriter.UL, XHtmlWriter.DD, XHtmlWriter.DT, XHtmlWriter.LI, XHtmlWriter.TBODY, XHtmlWriter.TD, XHtmlWriter.TFOOT, XHtmlWriter.TH, XHtmlWriter.THEAD, XHtmlWriter.TR, XHtmlWriter.BUTTON, XHtmlWriter.DEL, XHtmlWriter.INS, XHtmlWriter.MAP, XHtmlWriter.OBJECT, XHtmlWriter.SCRIPT, XHtmlWriter.BR);
    /** Elements whose character content is skipped by {@code documentToReadableText}. */
    private static final List<String> IGNORE_INSIDE = Arrays.asList(XHtmlWriter.SCRIPT, "style");
    /** Matches {@code <br>}, {@code <br/>} and {@code <br />}; flag 2 is {@code Pattern.CASE_INSENSITIVE}. */
    private static final Pattern HTML_TO_READABLE_TEXT = Pattern.compile("\\<br\\s?\\/?\\>", 2);
    /** Matches a closing {@code </p>} tag; flag 2 is {@code Pattern.CASE_INSENSITIVE}. */
    private static final Pattern HTML_TO_READABLE_TEXT2 = Pattern.compile("\\<\\/p\\>", 2);
    /**
     * Matches whitespace-only lines as well as leading and trailing spaces/tabs of a line; flag 8 is
     * {@code Pattern.MULTILINE}, so {@code ^} and {@code $} anchor at every line.
     */
    private static final Pattern NORMALIZE_LINES = Pattern.compile("^\\s+$|^[ \t]+|[ \t]+$", 8);
    /**
     * Matches comments, complete script and style elements (including their content) and any remaining tag.
     * Flag 34 is {@code Pattern.CASE_INSENSITIVE | Pattern.DOTALL}, so matches may span line breaks.
     */
    private static final Pattern STRIP_ALL_TAGS = Pattern.compile("<!--.*?-->|<script.*?>.*?</script>|<style.*?>.*?</style>|<.*?>", 34);
    /**
     * One lazily created {@link TransformerFactory} per thread, obtained via {@code TRANSFORMER_FACTORIES.get()}.
     * NOTE(review): presumably per-thread because JAXP factories are not guaranteed to be thread-safe - confirm.
     */
    private static final ThreadLocal<TransformerFactory> TRANSFORMER_FACTORIES = new ThreadLocal<TransformerFactory>() { // from class: ws.palladian.helper.html.HtmlHelper.1
        /* JADX INFO: Access modifiers changed from: protected */
        /* JADX WARN: Can't rename method to resolve collision */
        @Override // java.lang.ThreadLocal
        public TransformerFactory initialValue() {
            // Invoked once per thread, on the first get() from that thread.
            return TransformerFactory.newInstance();
        }
    };

    /** Private constructor: this class only offers static helpers and is not meant to be instantiated. */
    private HtmlHelper() {
    }

    /**
     * Counts every tag-like occurrence in the given text, including duplicates.
     *
     * @param str the text to scan
     * @return the total number of tag occurrences
     */
    public static int countTags(String str) {
        final boolean distinctOnly = false;
        return countTags(str, distinctOnly);
    }

    /** Matches any {@code <...>} span; case-insensitive, and {@code .} also matches line terminators. */
    private static final Pattern TAG_LENGTH_PATTERN = Pattern.compile("<(.*?)>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Sums up the number of characters that belong to tags, counting the angle brackets as well.
     *
     * @param str the text to scan; {@code null} is treated like an empty text
     * @return the accumulated length of all {@code <...>} spans, {@code 0} if there are none
     */
    public static int countTagLength(String str) {
        if (str == null) {
            return 0;
        }
        int totalLength = 0;
        Matcher tagMatcher = TAG_LENGTH_PATTERN.matcher(str);
        while (tagMatcher.find()) {
            // Whole match = captured tag content plus the two angle brackets.
            totalLength += tagMatcher.end() - tagMatcher.start();
        }
        return totalLength;
    }

    /** Matches any {@code <...>} span; case-insensitive, and {@code .} also matches line terminators. */
    private static final Pattern COUNT_TAGS_PATTERN = Pattern.compile("(\\<.*?>)", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Counts the tag-like occurrences in the given text.
     *
     * @param str the text to scan; {@code null} is treated like an empty text
     * @param z if {@code true}, textually identical tags are counted only once; otherwise every occurrence counts
     * @return the number of (distinct) tags found
     */
    public static int countTags(String str, boolean z) {
        if (str == null) {
            return 0;
        }
        Set<String> distinctTags = new HashSet<String>();
        int occurrences = 0;
        Matcher tagMatcher = COUNT_TAGS_PATTERN.matcher(str);
        while (tagMatcher.find()) {
            occurrences++;
            // Distinctness is decided on the exact matched text, so "<B>" and "<b>" are different tags.
            distinctTags.add(tagMatcher.group());
        }
        return z ? distinctTags.size() : occurrences;
    }

    /**
     * Removes comments, script and style elements (including their content) and all remaining tags from the text.
     *
     * @param str the HTML text; may be {@code null}
     * @return the text without markup, or {@code null} if {@code str} is {@code null} (same convention as the
     *         overload taking a set of elements)
     */
    public static String stripHtmlTags(String str) {
        if (str == null) {
            return null;
        }
        return STRIP_ALL_TAGS.matcher(str).replaceAll("");
    }

    /**
     * Removes only the selected kinds of markup from the text.
     *
     * @param str the HTML text; {@code null} yields {@code null}
     * @param set the kinds of markup to remove; an empty set returns {@code str} unchanged
     * @return the text with the selected markup removed
     * @deprecated kept for existing callers
     */
    @Deprecated
    public static String stripHtmlTags(String str, Set<HtmlElement> set) {
        if (str == null) {
            return null;
        }
        if (set.isEmpty()) {
            return str;
        }
        // Parallel tables: the order defines the order of the regex alternation and therefore match priority.
        HtmlElement[] kinds = {HtmlElement.COMMENTS, HtmlElement.SCRIPT, HtmlElement.CSS, HtmlElement.TAG};
        String[] expressions = {"<!--.*?-->", "<script.*?>.*?</script>", "<style.*?>.*?</style>", "<.*?>"};
        List<String> alternatives = CollectionHelper.newArrayList();
        for (int k = 0; k < kinds.length; k++) {
            if (set.contains(kinds[k])) {
                alternatives.add(expressions[k]);
            }
        }
        String alternation = StringUtils.join(alternatives, "|");
        Pattern stripPattern = Pattern.compile(alternation, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        return stripPattern.matcher(str).replaceAll("");
    }

    /**
     * Varargs convenience variant of {@link #stripHtmlTags(String, Set)}.
     *
     * @param str the HTML text; {@code null} yields {@code null}
     * @param htmlElementArr the kinds of markup to remove; an empty array leaves the text unchanged
     * @return the text with the selected markup removed
     */
    public static String stripHtmlTags(String str, HtmlElement... htmlElementArr) {
        // EnumSet.copyOf rejects an empty non-EnumSet collection with an IllegalArgumentException,
        // so start from an empty EnumSet and add the requested elements instead.
        Set<HtmlElement> selected = EnumSet.noneOf(HtmlElement.class);
        selected.addAll(Arrays.asList(htmlElementArr));
        return stripHtmlTags(str, selected);
    }

    /**
     * Removes whitespace between adjacent tags and then deletes every match of
     * {@code FileHelper.NEWLINE_CHARACTER} (used as a regular expression).
     *
     * @param str the text to compact; may be {@code null}
     * @return the compacted text, or {@code null} if {@code str} is {@code null}
     */
    public static String joinTagsAndRemoveNewLines(String str) {
        if (str == null) {
            return null;
        }
        String tagsJoined = str.replaceAll(">\\s*?<", "><");
        return tagsJoined.replaceAll(FileHelper.NEWLINE_CHARACTER, "");
    }

    /**
     * Removes all occurrences of the named tag, using the same name for the opening and the closing part.
     *
     * @param str the text to clean
     * @param str2 the tag name
     * @return the text without the matched tag occurrences
     */
    public static String removeConcreteHtmlTag(String str, String str2) {
        final String closingName = str2;
        return removeConcreteHtmlTag(str, str2, closingName);
    }

    /**
     * Removes every span that {@link #getConcreteTags(String, String, String)} finds in the text.
     *
     * @param str the text to clean
     * @param str2 the opening tag name (or start expression)
     * @param str3 the closing tag name (or end expression)
     * @return the text with each found span replaced by the empty string
     */
    public static String removeConcreteHtmlTag(String str, String str2, String str3) {
        String cleaned = str;
        for (String occurrence : getConcreteTags(str, str2, str3)) {
            // Literal replacement; removes all copies of this exact span at once.
            cleaned = cleaned.replace(occurrence, "");
        }
        return cleaned;
    }

    /**
     * Finds all occurrences of the named tag, using the same name for the opening and the closing part.
     *
     * @param str the text to search
     * @param str2 the tag name
     * @return the matched spans in order of appearance
     */
    public static List<String> getConcreteTags(String str, String str2) {
        final String closingName = str2;
        return getConcreteTags(str, str2, closingName);
    }

    /**
     * Collects all spans of the text that match a tag expression.
     * <p>
     * If both names are equal, the expression is {@code <name...>} optionally followed by everything up to
     * {@code </name>}. Otherwise the two arguments are used verbatim as start and end of a lazy regular expression.
     * Matching is case-insensitive and spans line breaks.
     *
     * @param str the text to search
     * @param str2 the opening tag name, or the start expression
     * @param str3 the closing tag name, or the end expression
     * @return the matched spans in order of appearance; empty if nothing matches
     */
    public static List<String> getConcreteTags(String str, String str2, String str3) {
        String expression;
        if (str2.equals(str3)) {
            expression = "<" + str2 + ".*?>(.*?</" + str3 + ">)?";
        } else {
            expression = str2 + ".*?" + str3;
        }
        Pattern tagPattern = Pattern.compile(expression, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        List<String> found = new ArrayList<String>();
        for (Matcher hits = tagPattern.matcher(str); hits.find();) {
            found.add(hits.group(0));
        }
        return found;
    }

    /**
     * Converts a DOM subtree into plain text by streaming it through an identity transformation into a SAX handler.
     * <p>
     * A line break is emitted at the start and end of every element listed in {@code BLOCK_ELEMENTS}; character
     * data inside elements listed in {@code IGNORE_INSIDE} is dropped. Afterwards the text is normalized:
     * whitespace-only lines and leading/trailing blanks per line are removed, three or more consecutive line feeds
     * are reduced to two, runs of spaces are replaced by {@code Strings.SINGLE_SPACE_STRING} and the result is
     * trimmed.
     * <p>
     * Transformer failures are logged and not rethrown; whatever text was collected until then is still returned.
     *
     * @param node the root of the subtree to convert
     * @return the normalized text
     */
    public static String documentToReadableText(Node node) {
        final StringBuilder sb = new StringBuilder();
        try {
            TRANSFORMER_FACTORIES.get().newTransformer().transform(new DOMSource(node), new SAXResult(new DefaultHandler() { // from class: ws.palladian.helper.html.HtmlHelper.2
                /** True while inside an ignored element; a plain flag, so nesting depth is not tracked. */
                boolean ignoreCharacters = false;

                @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
                    // Fixed locale: with e.g. a Turkish default locale "DIV" would lower-case to "dıv" and never match.
                    String lowerCase = str2.toLowerCase(Locale.ENGLISH);
                    if (HtmlHelper.IGNORE_INSIDE.contains(lowerCase)) {
                        this.ignoreCharacters = true;
                    } else if (HtmlHelper.BLOCK_ELEMENTS.contains(lowerCase) || str2.equalsIgnoreCase(XHtmlWriter.BR)) {
                        sb.append(FileHelper.NEWLINE_CHARACTER);
                    }
                }

                @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                public void endElement(String str, String str2, String str3) throws SAXException {
                    String lowerCase = str2.toLowerCase(Locale.ENGLISH);
                    if (HtmlHelper.IGNORE_INSIDE.contains(lowerCase)) {
                        this.ignoreCharacters = false;
                    } else if (HtmlHelper.BLOCK_ELEMENTS.contains(lowerCase)) {
                        sb.append(FileHelper.NEWLINE_CHARACTER);
                    }
                }

                @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                public void characters(char[] cArr, int i, int i2) throws SAXException {
                    if (this.ignoreCharacters) {
                        return;
                    }
                    sb.append(cArr, i, i2);
                }
            }));
        } catch (TransformerConfigurationException e) {
            LOGGER.error("htmlDocToString:TransformerConfigurationException", e);
        } catch (TransformerException e2) {
            LOGGER.error("htmlDocToString:TransformerException", e2);
        } catch (TransformerFactoryConfigurationError e3) {
            LOGGER.error("htmlDocToString:TransformerFactoryConfigurationError", e3);
        }
        String withoutBlankPadding = NORMALIZE_LINES.matcher(sb.toString()).replaceAll("");
        return withoutBlankPadding.replaceAll("\n{3,}", "\n\n").replaceAll(" {2,}", Strings.SINGLE_SPACE_STRING).trim();
    }

    /**
     * Turns an HTML string into text: {@code <br>} variants and closing {@code </p>} tags are replaced by
     * {@code FileHelper.NEWLINE_CHARACTER}, then all remaining markup is stripped.
     *
     * @param str the HTML text
     * @return the text without markup
     */
    public static String htmlToReadableText(String str) {
        String lineBreaksReplaced = HTML_TO_READABLE_TEXT.matcher(str).replaceAll(FileHelper.NEWLINE_CHARACTER);
        String paragraphsReplaced = HTML_TO_READABLE_TEXT2.matcher(lineBreaksReplaced).replaceAll(FileHelper.NEWLINE_CHARACTER);
        return stripHtmlTags(paragraphsReplaced);
    }

    /**
     * Extracts a value from the last match of a regular expression in a text.
     * <p>
     * From the last match, the first occurrence of {@code str3} (as given, upper-cased and lower-cased, each
     * interpreted as a regular expression) is removed, followed by all single and double quotes.
     *
     * @param str the regular expression locating the element; matched case-insensitively across line breaks
     * @param str2 the text to search
     * @param str3 the prefix to strip from the match, e.g. an attribute name; the empty string strips nothing
     * @return the cleaned last match, or the empty string if the expression does not match
     */
    public static String extractTagElement(String str, String str2, String str3) {
        Matcher hits = Pattern.compile(str, Pattern.CASE_INSENSITIVE | Pattern.DOTALL).matcher(str2);
        // Only the last match determines the result, so remember it and post-process once.
        String lastHit = null;
        while (hits.find()) {
            lastHit = hits.group(0);
        }
        if (lastHit == null) {
            return "";
        }
        String value = lastHit;
        if (!"".equals(str3)) {
            value = value.replaceFirst(str3, "");
            value = value.replaceFirst(str3.toUpperCase(Locale.ENGLISH), "");
            value = value.replaceFirst(str3.toLowerCase(Locale.ENGLISH), "");
        }
        return value.replaceAll("\"", "").replaceAll("'", "");
    }

    /**
     * Checks whether the node is an element whose name is one of a fixed set of inline formatting tags.
     *
     * @param node the node to test
     * @return {@code true} if the node is an element node with one of the listed names, compared case-insensitively
     */
    public static boolean isSimpleElement(Node node) {
        if (node.getNodeType() != Node.ELEMENT_NODE) {
            return false;
        }
        List<String> simpleNames = Arrays.asList(XHtmlWriter.B, "i", XHtmlWriter.EM, XHtmlWriter.INS, XHtmlWriter.DEL, "s", XHtmlWriter.SMALL, XHtmlWriter.BIG, XHtmlWriter.STRONG, XHtmlWriter.U);
        // Fixed locale: the default-locale variant maps "I" to a dotless i under e.g. Turkish settings.
        return simpleNames.contains(node.getNodeName().toLowerCase(Locale.ENGLISH));
    }

    /**
     * Checks whether the given tag name is one of the six headline levels.
     *
     * @param str the tag name; compared case-insensitively
     * @return {@code true} if the name equals one of the {@code XHtmlWriter.H1} to {@code XHtmlWriter.H6} constants
     */
    public static boolean isHeadlineTag(String str) {
        List<String> headlineNames = Arrays.asList(XHtmlWriter.H1, XHtmlWriter.H2, XHtmlWriter.H3, XHtmlWriter.H4, XHtmlWriter.H5, XHtmlWriter.H6);
        // Fixed locale keeps the comparison independent of the JVM's default locale.
        return headlineNames.contains(str.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Checks whether the node's name is a headline tag.
     *
     * @param node the node to test
     * @return the result of {@link #isHeadlineTag(String)} for the node's name
     */
    public static boolean isHeadlineTag(Node node) {
        String nodeName = node.getNodeName();
        return isHeadlineTag(nodeName);
    }

    /**
     * Returns a deep copy of the node from which all text nodes consisting only of whitespace have been removed.
     * The given node itself is not modified. XPath and DOM failures are logged and not rethrown; the copy is
     * returned in whatever state it has reached.
     *
     * @param node the node to copy and clean
     * @return the cleaned deep copy
     */
    public static Node removeWhitespace(Node node) {
        Node copy = node.cloneNode(true);
        try {
            Object evaluated = XPathFactory.newInstance().newXPath()
                    .compile("//text()[normalize-space(.) = '']")
                    .evaluate(copy, XPathConstants.NODESET);
            NodeList blankTextNodes = (NodeList) evaluated;
            int count = blankTextNodes.getLength();
            for (int index = 0; index < count; index++) {
                Node blank = blankTextNodes.item(index);
                blank.getParentNode().removeChild(blank);
            }
        } catch (XPathExpressionException xpathFailure) {
            LOGGER.error("Exception while removing whitespace", xpathFailure);
        } catch (DOMException domFailure) {
            LOGGER.error("Exception while removing whitespace", domFailure);
        }
        return copy;
    }

    /**
     * Serializes the node to the given file using an identity transformation.
     *
     * @param node the node to write
     * @param file the target file
     * @return {@code true} if the transformation completed, {@code false} if it failed (the failure is logged)
     */
    public static boolean writeToFile(Node node, File file) {
        try {
            Transformer serializer = TRANSFORMER_FACTORIES.get().newTransformer();
            serializer.transform(new DOMSource(node), new StreamResult(file));
            return true;
        } catch (TransformerConfigurationException configurationFailure) {
            LOGGER.error("Exception while writing to file", configurationFailure);
        } catch (TransformerException transformFailure) {
            LOGGER.error("Exception while writing to file", transformFailure);
        } catch (TransformerFactoryConfigurationError factoryFailure) {
            LOGGER.error("Exception while writing to file", factoryFailure);
        }
        return false;
    }

    /**
     * Serializes all children of the node (without XML declaration) and concatenates the results.
     * Children whose serialization fails, i.e. yields {@code null}, are skipped.
     *
     * @param node the node whose content is serialized
     * @return the concatenated XML of the child nodes; empty if there are none
     */
    public static String getInnerXml(Node node) {
        StringBuilder innerXml = new StringBuilder();
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            String serialized = xmlToString(child, true);
            if (serialized == null) {
                continue;
            }
            innerXml.append(serialized);
        }
        return innerXml.toString();
    }

    /**
     * Creates a new, empty DOM document using a namespace-aware document builder.
     *
     * @return the new document
     * @throws RuntimeException wrapping the {@link ParserConfigurationException} if no builder can be created
     */
    public static Document createDocument() {
        DocumentBuilderFactory builderFactory;
        try {
            builderFactory = DocumentBuilderFactory.newInstance();
            builderFactory.setNamespaceAware(true);
            return builderFactory.newDocumentBuilder().newDocument();
        } catch (ParserConfigurationException configurationFailure) {
            LOGGER.error("createDocument:ParserConfigurationException, throwing RuntimeException", configurationFailure);
            throw new RuntimeException(configurationFailure);
        }
    }

    /**
     * Removes all nodes of the given type from the subtree, regardless of their name.
     *
     * @param node the root of the subtree to clean
     * @param s the node type to remove, one of the {@code Node.*_NODE} constants
     */
    public static void removeAll(Node node, short s) {
        final String anyName = null;
        removeAll(node, s, anyName);
    }

    /**
     * Recursively removes all nodes of the given type (and, optionally, name) from the subtree.
     * <p>
     * A matching node is detached together with its whole subtree. A matching node without a parent (a document
     * or a detached root) cannot be detached; it is kept and its children are still processed.
     *
     * @param node the root of the subtree to clean; {@code null} is ignored
     * @param s the node type to remove, one of the {@code Node.*_NODE} constants
     * @param str the exact node name to remove, or {@code null} to remove nodes of the type regardless of name
     */
    public static void removeAll(Node node, short s, String str) {
        if (node == null) {
            return;
        }
        boolean matches = node.getNodeType() == s && (str == null || node.getNodeName().equals(str));
        if (matches) {
            Node parent = node.getParentNode();
            if (parent != null) {
                parent.removeChild(node);
                return;
            }
            // No parent to detach from: fall through and clean the children instead of failing.
        }
        NodeList children = node.getChildNodes();
        // Walk backwards because the child list is live and shrinks when a child is removed.
        for (int index = children.getLength() - 1; index >= 0; index--) {
            removeAll(children.item(index), s, str);
        }
    }

    /**
     * Creates a copy of the document by running it through an identity transformation into a {@link DOMResult}.
     * Failures are logged together with their stack trace and are not rethrown.
     *
     * @param document the document to copy
     * @return the copy, or {@code null} if the transformation failed
     */
    public static Document cloneDocument(Document document) {
        Document copy = null;
        try {
            Transformer identity = TRANSFORMER_FACTORIES.get().newTransformer();
            DOMResult target = new DOMResult();
            identity.transform(new DOMSource(document), target);
            copy = (Document) target.getNode();
        } catch (TransformerConfigurationException e) {
            // Pass the exception along so the stack trace is not lost, as the other helpers of this class do.
            LOGGER.error("cloneDocument:TransformerConfigurationException " + e.getMessage(), e);
        } catch (TransformerException e2) {
            LOGGER.error("cloneDocument:TransformerException " + e2.getMessage(), e2);
        } catch (TransformerFactoryConfigurationError e3) {
            LOGGER.error("cloneDocument:TransformerFactoryConfigurationError " + e3.getMessage(), e3);
        } catch (DOMException e4) {
            LOGGER.error("cloneDocument:DOMException " + e4.getMessage(), e4);
        }
        return copy;
    }

    /**
     * Serializes a node to an XML string.
     * Transformation failures are logged together with their stack trace and are not rethrown.
     *
     * @param node the node to serialize; must not be {@code null}
     * @param z if {@code true}, the XML declaration is omitted from the output
     * @return the serialized XML, or {@code null} if the transformation failed
     */
    public static String xmlToString(Node node, boolean z) {
        Validate.notNull(node, "node must not be null.");
        String xml = null;
        try {
            StringWriter buffer = new StringWriter();
            Transformer serializer = TRANSFORMER_FACTORIES.get().newTransformer();
            if (z) {
                serializer.setOutputProperty("omit-xml-declaration", "yes");
            }
            // XHtmlWriter.METHOD is presumably the "method" output key - its value is not visible here.
            serializer.setOutputProperty(XHtmlWriter.METHOD, "xml");
            serializer.transform(new DOMSource(node), new StreamResult(buffer));
            xml = buffer.toString();
        } catch (TransformerConfigurationException e) {
            // Pass the exception along so the stack trace is not lost.
            LOGGER.error("Encountered TransformerConfigurationException while transforming Node: " + e.getMessage(), e);
        } catch (TransformerException e2) {
            LOGGER.error("Encountered TransformerException while transforming Node: " + e2.getMessage(), e2);
        }
        return xml;
    }

    /**
     * Serializes a node to an XML string without suppressing the XML declaration.
     *
     * @param node the node to serialize
     * @return the serialized XML as produced by {@link #xmlToString(Node, boolean)}
     */
    public static String xmlToString(Node node) {
        final boolean omitDeclaration = false;
        return xmlToString(node, omitDeclaration);
    }

    /**
     * Prints the subtree below the node to standard output, starting without indentation.
     *
     * @param node the root of the subtree to print
     */
    public static void printDom(Node node) {
        final int rootDepth = 0;
        printDom(node, rootDepth);
    }

    /**
     * Prints one node (name, prefix, namespace URI and attributes) and then, recursively, its children,
     * each level indented by one more repetition of {@code Strings.SINGLE_SPACE_STRING}.
     *
     * @param node the node to print
     * @param i the nesting depth, used as the indentation width
     */
    private static void printDom(Node node, int i) {
        String indent = StringUtils.repeat(Strings.SINGLE_SPACE_STRING, i);
        System.out.println(indent + node.getNodeName() + "(" + node.getPrefix() + " : " + node.getNamespaceURI() + ")");
        if (node.getAttributes() != null) {
            int position = 0;
            while (position < node.getAttributes().getLength()) {
                System.out.println(indent + "@" + node.getAttributes().item(position));
                position++;
            }
        }
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            printDom(child, i + 1);
        }
    }

    /**
     * Recursively collects the text of a DOM subtree.
     * <p>
     * Script, style, comment, option, meta and head nodes contribute nothing. Text nodes contribute their content,
     * block-level elements contribute a line break before their children. At every level, runs of two or more
     * spaces are removed from the collected text. Exceptions raised while visiting children are logged and
     * not rethrown.
     *
     * @param node the root of the subtree; {@code null} yields the empty string
     * @return the collected text
     */
    public static String documentToText(Node node) {
        if (node == null || node.getNodeName() == null) {
            return "";
        }
        String[] skippedNames = {XHtmlWriter.SCRIPT, "style", "#comment", XHtmlWriter.OPTION, XHtmlWriter.META, XHtmlWriter.HEAD};
        for (String skipped : skippedNames) {
            if (node.getNodeName().equalsIgnoreCase(skipped)) {
                return "";
            }
        }
        StringBuilder collected = new StringBuilder();
        String ownText = node.getTextContent();
        boolean isTextNode = ownText != null && node.getNodeName().equalsIgnoreCase("#text");
        if (isTextNode) {
            collected.append(ownText);
        }
        if (isWrappingNode(node)) {
            collected.append(FileHelper.NEWLINE_CHARACTER);
        }
        try {
            Node child = node.getFirstChild();
            while (child != null) {
                collected.append(documentToText(child));
                child = child.getNextSibling();
            }
        } catch (Exception failure) {
            LOGGER.error(failure.getMessage());
        }
        return collected.toString().replaceAll("[ ]{2,}", "");
    }

    /**
     * Collects the absolute URLs of all {@code a/@href} links in the document.
     * <p>
     * Each link has its anchor removed and is resolved against the document URI and an optional
     * {@code head/base/@href}. A link counts as internal if its domain, after one leading sub-domain label has
     * been stripped by the regular expression below, equals the document's domain ignoring case.
     *
     * @param document the document to scan; {@code null} yields an empty set
     * @param z whether internal links are included
     * @param z2 whether external links are included
     * @param str the prefix every returned URL must start with
     * @return the matching URLs without duplicates
     */
    public static Set<String> getLinks(Document document, boolean z, boolean z2, String str) {
        Set<String> links = new HashSet<String>();
        if (document == null) {
            return links;
        }
        String pageUrl = UrlHelper.removeAnchors(document.getDocumentURI());
        String pageDomain = UrlHelper.getDomain(pageUrl, false);
        Node baseHref = XPathHelper.getXhtmlNode(document, "//head/base/@href");
        String baseUrl = null;
        if (baseHref != null) {
            baseUrl = baseHref.getTextContent();
        }
        for (Node hrefAttribute : XPathHelper.getXhtmlNodes(document, "//a/@href")) {
            String target = UrlHelper.removeAnchors(hrefAttribute.getTextContent().trim());
            String fullUrl = UrlHelper.makeFullUrl(pageUrl, baseUrl, target);
            if (fullUrl.length() == 0) {
                continue;
            }
            String linkDomain = UrlHelper.getDomain(fullUrl, false).replaceFirst("[a-zA-Z-_]+\\.(?=[a-z]+\\.)", "");
            boolean internal = linkDomain.equalsIgnoreCase(pageDomain);
            // Internal links are governed by z, external ones by z2.
            boolean requested = internal ? z : z2;
            if (requested && fullUrl.startsWith(str)) {
                links.add(fullUrl);
            }
        }
        return links;
    }

    /**
     * Collects the absolute URLs of all links in the document without restricting them to a prefix.
     *
     * @param document the document to scan
     * @param z whether internal links are included
     * @param z2 whether external links are included
     * @return the matching URLs without duplicates
     */
    public static Set<String> getLinks(Document document, boolean z, boolean z2) {
        final String anyPrefix = "";
        return getLinks(document, z, z2, anyPrefix);
    }

    /**
     * Returns the raw text content of the document's last child node.
     *
     * @param document the document; may be {@code null}
     * @return the text content of the last child, or the empty string if the document is {@code null} or has
     *         no children
     */
    public static String getDocumentTextDump(Document document) {
        if (document == null) {
            return "";
        }
        if (document.getLastChild() == null) {
            return "";
        }
        return document.getLastChild().getTextContent();
    }

    /**
     * Checks whether the node's name is listed in {@code BLOCK_ELEMENTS}.
     *
     * @param node the node to test
     * @return {@code true} if the lower-cased node name is a block-level element name
     */
    private static boolean isWrappingNode(Node node) {
        // Fixed locale: default-locale lower-casing would turn "DIV" into "dıv" under e.g. Turkish settings.
        String lowerCaseName = node.getNodeName().toLowerCase(Locale.ENGLISH);
        return BLOCK_ELEMENTS.contains(lowerCaseName);
    }

    /**
     * Collects all nodes below the given node in document order (each child followed by its own descendants).
     * Despite the method name, the result contains the node's descendants, not its siblings; only element
     * children are descended into.
     *
     * @param node the root whose descendants are collected; must not be {@code null}
     * @return the descendants in depth-first order; empty if the node has no children
     */
    public static List<Node> getAllSiblings(Node node) {
        Validate.notNull(node, "node must not be null");
        List<Node> descendants = CollectionHelper.newArrayList();
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            descendants.add(child);
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                descendants.addAll(getAllSiblings(child));
            }
        }
        return descendants;
    }
}
