package ws.palladian.helper;

import com.aliasi.xml.XHtmlWriter;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.spi.LocationInfo;
import org.h2.message.Trace;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.helper.html.XPathHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/helper/UrlHelper.class */
/**
 * Static helpers for cleaning, resolving, canonicalizing and extracting URLs.
 *
 * <p>The class cannot be instantiated; every member is static.
 *
 * <p>String literals such as {@code "/"}, {@code ".."} or {@code "href"} are spelled out in this
 * class on purpose instead of being borrowed from same-valued constants of unrelated libraries,
 * so that the behavior does not depend on those libraries keeping their constant values.
 */
public final class UrlHelper {
    /** Regex alternation of the top-level domains accepted by {@link #URL_PATTERN}. */
    private static final String TOP_LEVEL_DOMAINS = "ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|edu|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xxx|ye|yt|yu|za|zm|zw";

    private static final Logger LOGGER = LoggerFactory.getLogger(UrlHelper.class);

    /** Attribute names whose values are rewritten by {@link #makeAbsoluteUrls(Document)}. */
    private static final List<String> LINK_ATTRIBUTES = Arrays.asList("href", "src");

    /** Matches a session parameter name followed by exactly 32 lower-case hex characters. */
    private static final Pattern SESSION_ID_PATTERN = Pattern.compile("(?<!\\w)(jsessionid=|s=|sid=|PHPSESSID=|sessionid=)[a-f0-9]{32}(?!\\w)");

    /** Matches a {@code #} and everything after it up to the end of the line. */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile("#.*");

    /**
     * Case-insensitive pattern for URLs in free text: optional {@code http://} or
     * {@code https://} prefix, one or more dot-terminated host labels, one of the
     * {@link #TOP_LEVEL_DOMAINS}, and an optional path/query part.
     */
    public static final Pattern URL_PATTERN = Pattern.compile(
            "\\b(?:https?://)?([0-9a-zäöü-]{1,63}?\\.)+(?:" + TOP_LEVEL_DOMAINS + ")"
                    + "(?:[?/](?:\\([^\\s()<>\\[\\]\"']{0,255}\\)|[^\\s()<>\\[\\]\"']{0,255})+(?:\\([^\\s()<>\\[\\]\"']{0,255}\\)|[^\\s.,;!?:()<>\\[\\]\"'])|/|\\b)",
            Pattern.CASE_INSENSITIVE);

    private UrlHelper() {
        // utility class, not meant to be instantiated
    }

    /**
     * Removes every session id matched by {@link #SESSION_ID_PATTERN} from the given string.
     *
     * @param str the string to clean, may be {@code null}
     * @return the string without the matched session ids, or {@code null} if {@code str} is
     *         {@code null}
     */
    public static String removeSessionId(String str) {
        if (str == null) {
            return null;
        }
        return SESSION_ID_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Rewrites the {@code href} and {@code src} attribute values in the document to the result of
     * {@link #makeFullUrl(String, String, String)}, using the document URI and the document's
     * base URL as context. Attributes whose value would not change are left untouched.
     *
     * @param document the document to modify in place, must not be {@code null}
     */
    public static void makeAbsoluteUrls(Document document) {
        String documentUrl = document.getDocumentURI();
        String baseUrl = getBaseUrl(document);
        for (String attributeName : LINK_ATTRIBUTES) {
            // select every element that carries the attribute
            String xPath = "//*[@" + attributeName + "]";
            for (Node element : XPathHelper.getXhtmlNodes(document, xPath)) {
                Node attribute = element.getAttributes().getNamedItem(attributeName);
                String originalValue = attribute.getNodeValue();
                String fullUrl = makeFullUrl(documentUrl, baseUrl, originalValue);
                if (!fullUrl.equals(originalValue)) {
                    LOGGER.debug("{} -> {}", originalValue, fullUrl);
                    attribute.setNodeValue(fullUrl);
                }
            }
        }
    }

    /**
     * Looks up the node at XPath {@code //head/base/@href} in the document.
     *
     * @param document the document to inspect
     * @return the text content of that node, or {@code null} if no such node was found
     */
    public static String getBaseUrl(Document document) {
        Node baseNode = XPathHelper.getXhtmlNode(document, "//head/base/@href");
        return baseNode != null ? baseNode.getTextContent() : null;
    }

    /**
     * Resolves a link against a page URL and an optional base URL.
     *
     * <p>If both {@code str} and {@code str2} are given, the base URL (with a trailing
     * {@code /} appended when missing) is first resolved against the page URL and the link is
     * then resolved against that result. If only one of them is given, it is used as the context
     * directly. If neither is given, the link is returned unchanged.
     *
     * @param str the URL of the page containing the link, may be {@code null}
     * @param str2 the base URL declared by the page, may be {@code null}
     * @param str3 the link to resolve, must not be {@code null}
     * @return the resolved URL, or {@code str3} itself if it could not be resolved
     * @throws NullPointerException if {@code str3} is {@code null}
     */
    public static String makeFullUrl(String str, String str2, String str3) {
        if (str3 == null) {
            throw new NullPointerException("linkUrl must not be null");
        }
        String baseUrl = str2;
        if (baseUrl != null && !baseUrl.endsWith("/")) {
            baseUrl = baseUrl.concat("/");
        }
        String contextUrl;
        if (str != null && baseUrl != null) {
            contextUrl = makeFullUrl(str, baseUrl);
        } else if (str != null) {
            contextUrl = str;
        } else {
            contextUrl = baseUrl;
        }
        return makeFullUrl(contextUrl, str3);
    }

    /**
     * Resolves {@code str2} against the context URL {@code str} using
     * {@link URL#URL(URL, String)}.
     *
     * @param str the context URL, may be {@code null}
     * @param str2 the link to resolve
     * @return the resolved URL; {@code str2} unchanged if {@code str} is {@code null} or the
     *         resolution fails with a {@link MalformedURLException}
     */
    public static String makeFullUrl(String str, String str2) {
        if (str == null) {
            return str2;
        }
        try {
            return new URL(new URL(str), str2).toString();
        } catch (MalformedURLException e) {
            // best effort by design: an unresolvable link is handed back as it came in
            LOGGER.trace("could not resolve {} against {}", str2, str);
            return str2;
        }
    }

    /**
     * Strips, in this order and each at most once, a leading {@code https://}, a leading
     * {@code http://} and a leading {@code www.} from the given URL. The checks are
     * case-sensitive.
     *
     * @param str the URL to clean, may be {@code null}
     * @return the cleaned URL; the empty string if {@code str} is {@code null}
     */
    public static String getCleanUrl(String str) {
        String cleaned = str == null ? "" : str;
        for (String prefix : new String[] {"https://", "http://", "www."}) {
            if (cleaned.startsWith(prefix)) {
                cleaned = cleaned.substring(prefix.length());
            }
        }
        return cleaned;
    }

    /**
     * Removes each {@code #} together with the rest of its line from the given string.
     *
     * @param str the URL to process, may be {@code null}
     * @return the string without anchors, or {@code null} if {@code str} is {@code null}
     */
    public static String removeAnchors(String str) {
        if (str == null) {
            return null;
        }
        return ANCHOR_PATTERN.matcher(str).replaceAll("");
    }

    /**
     * Extracts the host of a URL, optionally prefixed with its protocol.
     *
     * @param str the URL to inspect
     * @param z whether to prefix the host with the protocol and {@code ://}
     * @return the host (with or without protocol); the empty string if the URL has no host or
     *         cannot be parsed by {@link URL}
     */
    public static String getDomain(String str, boolean z) {
        // Must be initialised: the "no host" and "malformed" paths below assign nothing.
        String domain = "";
        try {
            URL url = new URL(str);
            String host = url.getHost();
            if (host.isEmpty()) {
                LOGGER.trace("no domain specified {}", str);
            } else {
                String prefix = z ? url.getProtocol() + "://" : "";
                domain = prefix + host;
                LOGGER.trace("root url for {} -> {}", str, domain);
            }
        } catch (MalformedURLException e) {
            LOGGER.trace("could not determine domain for {}", str);
        }
        return domain;
    }

    /**
     * Extracts protocol and host of a URL; same as {@code getDomain(str, true)}.
     *
     * @param str the URL to inspect
     * @return protocol and host, or the empty string if they cannot be determined
     */
    public static String getDomain(String str) {
        return getDomain(str, true);
    }

    /**
     * Builds a normalized form of the URL: protocol, lower-cased host, port (only when it is
     * explicit and differs from the protocol's default), normalized path and the query with its
     * {@code &}-separated parameters sorted. A fragment is not part of the result.
     *
     * @param str the URL to canonicalize, may be {@code null}
     * @return the canonical URL; the empty string if {@code str} is {@code null} or cannot be
     *         parsed by {@link URL}
     */
    public static String getCanonicalUrl(String str) {
        if (str == null) {
            return "";
        }
        try {
            URL url = new URL(str);
            String protocol = url.getProtocol();
            String port = "";
            if (url.getPort() != -1 && url.getPort() != url.getDefaultPort()) {
                port = ":" + url.getPort();
            }
            // Locale.ROOT keeps the result independent of the JVM's default locale
            String host = url.getHost().toLowerCase(Locale.ROOT);
            String[] queryParts = null;
            if (url.getQuery() != null) {
                queryParts = url.getQuery().split("&");
                Arrays.sort(queryParts);
            }
            String path = canonicalizePath(url.getPath(), queryParts == null);
            String query = queryParts != null ? "?" + StringUtils.join(queryParts, "&") : "";
            // the port belongs after the host
            return protocol + "://" + host + port + path + query;
        } catch (MalformedURLException e) {
            LOGGER.trace("could not determine canonical url for {}", str);
            return "";
        }
    }

    /**
     * Normalizes a URL path: trims segments, drops empty ones, resolves {@code ..} segments and
     * optionally removes a trailing {@code index.*} file name.
     *
     * @param path the path as returned by {@link URL#getPath()}
     * @param stripIndexFile whether a trailing {@code index.*} should be removed
     * @return the normalized path, always starting with {@code /}
     */
    private static String canonicalizePath(String path, boolean stripIndexFile) {
        String[] segments = path.split("/");
        if (segments.length == 0) {
            return "/";
        }
        // Remember the last segment before the array is compacted; a ".." counts as empty.
        String lastSegment = segments[segments.length - 1].trim();
        if ("..".equals(lastSegment)) {
            lastSegment = "";
        }
        // Use the front of the array as a stack so that consecutive ".." segments each remove
        // one more level. The write index never overtakes the read index.
        int depth = 0;
        for (int i = 0; i < segments.length; i++) {
            String segment = segments[i].trim();
            if ("..".equals(segment)) {
                if (depth > 0) {
                    depth--;
                }
            } else {
                segments[depth++] = segment;
            }
        }
        StringBuilder joined = new StringBuilder("/");
        for (int i = 0; i < depth; i++) {
            if (!segments[i].isEmpty()) {
                joined.append(segments[i]).append('/');
            }
        }
        String result = joined.toString();
        if (lastSegment.contains(".")) {
            // a last segment with a dot is treated as a file name: no trailing slash
            result = result.substring(0, result.length() - 1);
        }
        if (stripIndexFile && lastSegment.contains("index")) {
            result = result.replaceAll("index\\..+$", "");
        }
        return result;
    }

    /**
     * URL-decodes the given string using UTF-8.
     *
     * @param str the string to decode
     * @return the decoded string
     * @throws IllegalStateException if the runtime does not support UTF-8
     */
    public static String decodeParameter(String str) {
        try {
            return URLDecoder.decode(str, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 encoding unsupported. This should not happen.", e);
        }
    }

    /**
     * URL-encodes the given string using UTF-8.
     *
     * @param str the string to encode
     * @return the encoded string
     * @throws IllegalStateException if the runtime does not support UTF-8
     */
    public static String encodeParameter(String str) {
        try {
            return URLEncoder.encode(str, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 encoding unsupported. This should not happen.", e);
        }
    }

    /**
     * Collects the matches of {@link #URL_PATTERN} in the given text via
     * {@code StringHelper.getRegexpMatches}.
     *
     * @param str the text to search
     * @return the matches as returned by {@code StringHelper.getRegexpMatches}
     */
    public static List<String> extractUrls(String str) {
        return StringHelper.getRegexpMatches(URL_PATTERN, str);
    }

    /**
     * Tells whether the URL uses the {@code file} protocol (compared case-insensitively) and has
     * no host.
     *
     * @param url the URL to check, must not be {@code null}
     * @return {@code true} if the protocol is {@code file} and the host is {@code null} or empty
     */
    public static boolean isLocalFile(URL url) {
        String host = url.getHost();
        boolean hostMissing = host == null || host.isEmpty();
        return "file".equalsIgnoreCase(url.getProtocol()) && hostMissing;
    }
}
