package ws.palladian.retrieval.wikipedia;

import com.aliasi.sentences.SentenceChunker;
import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.h2.expression.Function;
import org.jdesktop.swingx.JXLabel;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import ws.palladian.extraction.location.GeoCoordinate;
import ws.palladian.extraction.location.GeoUtils;
import ws.palladian.extraction.location.ImmutableGeoCoordinate;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.constants.Language;
import ws.palladian.helper.html.HtmlElement;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.HttpException;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.wikipedia.WikipediaPage;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/wikipedia/WikipediaUtil.class */
public final class WikipediaUtil {
    // Logger shared by all static helpers of this class.
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaUtil.class);
    // Matches a <ref ...>text</ref> element whose content holds no '<', or a self-closing <ref ... />.
    // The numeric flag 8 is Pattern.MULTILINE.
    private static final Pattern REF_PATTERN = Pattern.compile("<ref(?:\\s[^>]*)?>[^<]*</ref>|<ref[^/>]*/>", 8);
    // Matches a whole line of the form "== Heading =="; group 1 is the text between the '=' runs.
    // The numeric flag 8 is Pattern.MULTILINE, so ^ and $ anchor at line boundaries.
    private static final Pattern HEADING_PATTERN = Pattern.compile("^={1,6}([^=]*)={1,6}$", 8);
    // Matches {{convert|<number>|<unit>|...}}; group 1 is the number, group 2 the unit, group 3 any further parameters.
    private static final Pattern CONVERT_PATTERN = Pattern.compile("\\{\\{convert\\|([\\d.]+)\\|([\\w°]+)(\\|[^}]*)?\\}\\}");
    // Matches [[target]] or [[target|label]]; group 1 is the target, group 2 the optional label.
    private static final Pattern INTERNAL_LINK_PATTERN = Pattern.compile("\\[\\[([^|\\]]*)(?:\\|([^|\\]]*))?\\]\\]");
    // Matches [http... label]; group 1 is the URL remainder after "http", group 2 the label.
    // The label part is not optional, so unlabeled external links do not match.
    private static final Pattern EXTERNAL_LINK_PATTERN = Pattern.compile("\\[http([^\\s]+)(?:\\s([^\\]]+))\\]");
    // Matches "#redirect [[target]]" (optionally with a colon); group 1 is the target.
    // The numeric flag 2 is Pattern.CASE_INSENSITIVE.
    private static final Pattern REDIRECT_PATTERN = Pattern.compile("#redirect\\s*:?\\s*\\[\\[(.*)\\]\\]", 2);
    // Matches an opening tag that contains no '/' (i.e. not self-closing).
    private static final Pattern OPEN_TAG_PATTERN = Pattern.compile("<\\w+[^>/]*>");
    // Matches a closing tag such as </ref>.
    private static final Pattern CLOSE_TAG_PATTERN = Pattern.compile("</\\w+[^>]*>");
    // Matches a {{Coord|...}} tag. Groups 1-4: latitude degrees, minutes, seconds, N/S;
    // groups 5-8: longitude degrees, minutes, seconds, W/E; group 9: all remaining "|..." parameters.
    // The numeric flag 2 is Pattern.CASE_INSENSITIVE.
    private static final Pattern COORDINATE_TAG_PATTERN = Pattern.compile("\\{\\{Coord\\|(-?\\d+(?:\\.\\d+)?)(?:\\|(\\d+(?:\\.\\d+)?)(?:\\|(\\d+(?:\\.\\d+)?))?)?(?:\\|([NS]))?\\|(-?\\d+(?:\\.\\d+)?)(?:\\|(\\d+(?:\\.\\d+)?)(?:\\|(\\d+(?:\\.\\d+)?))?)?(?:\\|([WE]))?((?:\\|[^}|<]+(?:<\\w+>[^<]*</\\w+>)?)*)\\}\\}", 2);

    /* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/wikipedia/WikipediaUtil$MarkupLocation.class */
    /**
     * A location described by a coordinate tag in MediaWiki markup. Instances are created and
     * filled field by field in {@code extractCoordinateTag}; apart from the coordinates, every
     * value may be {@code null} when the tag does not provide it.
     */
    public static final class MarkupLocation implements GeoCoordinate {
        /** Latitude in decimal degrees. */
        double lat;
        /** Longitude in decimal degrees. */
        double lng;
        /** Number given in parentheses after the "type" parameter, or {@code null}. */
        Long population;
        /** Value of the "display" parameter, or {@code null}. */
        String display;
        /** Value of the "name" parameter, or {@code null}. */
        String name;
        /** Value of the "type" parameter without the parenthesized part, or {@code null}. */
        String type;
        /** Value of the "region" parameter, or {@code null}. */
        String region;

        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("MarkupLocation [lat=");
            text.append(this.lat);
            text.append(", lng=").append(this.lng);
            text.append(", population=").append(this.population);
            text.append(", display=").append(this.display);
            text.append(", name=").append(this.name);
            text.append(", type=").append(this.type);
            text.append(", region=").append(this.region);
            // plain literal instead of an unrelated configuration-library constant
            text.append("]");
            return text.toString();
        }

        @Override // ws.palladian.extraction.location.GeoCoordinate
        public Double getLatitude() {
            return Double.valueOf(this.lat);
        }

        @Override // ws.palladian.extraction.location.GeoCoordinate
        public Double getLongitude() {
            return Double.valueOf(this.lng);
        }

        /** @return the "display" parameter of the tag, or {@code null} if absent. */
        public String getDisplay() {
            return this.display;
        }

        /** @return the population parsed from the "type" parameter, or {@code null} if absent. */
        public Long getPopulation() {
            return this.population;
        }
    }

    /**
     * Converts MediaWiki markup to plain text by applying a fixed sequence of clean-up steps.
     *
     * @param str the markup, must not be {@code null}
     * @return the stripped and trimmed text
     */
    public static String stripMediaWikiMarkup(String str) {
        Validate.notNull(str, "markup must not be null");

        // 1. drop <ref> elements, then decode HTML entities and strip remaining HTML tags
        String text = REF_PATTERN.matcher(str).replaceAll("");
        text = StringEscapeUtils.unescapeHtml4(text);
        text = HtmlHelper.stripHtmlTags(text);

        // 2. headings become their bare title followed by a newline; bold/italic quotes vanish
        text = HEADING_PATTERN.matcher(text).replaceAll("$1\n");
        text = text.replaceAll("'''''|'''|''", "");

        // 3. {{convert|value|unit}} becomes "value unit"
        text = CONVERT_PATTERN.matcher(text).replaceAll("$1 $2");

        // 4. links are replaced by their visible text, internal links first
        text = processLinks(text, INTERNAL_LINK_PATTERN);
        text = processLinks(text, EXTERNAL_LINK_PATTERN);

        // 5. remove everything inside curly braces, then any [[...]] left over
        text = removeArea(text, '{', '}');
        text = text.replaceAll("\\[\\[[^]]*\\]\\]", "");

        // 6. join single line breaks (unless a list item or blank line follows) into one line
        text = text.replaceAll("(?<!\n)\n(?![*\n])", Strings.SINGLE_SPACE_STRING);

        // 7. normalize whitespace and limit consecutive newlines to two
        text = StringHelper.removeDoubleWhitespaces(text);
        text = text.replaceAll("\n{2,}", "\n\n");
        return text.trim();
    }

    /**
     * Replaces every match of the given link pattern by the link's visible text: group 2 (the
     * label) when present, otherwise group 1 (the target). Links whose target starts with
     * "category:" (ignoring case) are removed entirely.
     *
     * @param str the text to process
     * @param pattern a pattern with the target in group 1 and the label in group 2
     * @return the text with links replaced
     */
    private static String processLinks(String str, Pattern pattern) {
        StringBuffer result = new StringBuffer();
        Matcher linkMatcher = pattern.matcher(str);
        while (linkMatcher.find()) {
            String target = linkMatcher.group(1);
            String label = linkMatcher.group(2);
            String replacement;
            if (target.toLowerCase().startsWith("category:")) {
                replacement = "";
            } else if (label != null) {
                replacement = label;
            } else {
                replacement = target;
            }
            // quote so that '$' and '\' in the link text are taken literally
            linkMatcher.appendReplacement(result, Matcher.quoteReplacement(replacement));
        }
        linkMatcher.appendTail(result);
        return result.toString();
    }

    /**
     * Removes every region enclosed by the given opening and closing characters, including
     * nested regions and the delimiters themselves.
     *
     * <p>An unmatched closing character is dropped but does not change the nesting depth.
     * Previously it drove the depth negative, which discarded the following text and kept the
     * content of the next balanced region.
     *
     * @param str the text to process
     * @param c the opening character
     * @param c2 the closing character
     * @return the text outside of all enclosed regions
     */
    private static String removeArea(String str, char c, char c2) {
        StringBuilder kept = new StringBuilder(str.length());
        int depth = 0;
        for (int idx = 0; idx < str.length(); idx++) {
            char current = str.charAt(idx);
            if (current == c) {
                depth++;
            } else if (current == c2) {
                // never go below zero: a stray closing character must not open a "negative" region
                if (depth > 0) {
                    depth--;
                }
            } else if (depth == 0) {
                kept.append(current);
            }
        }
        return kept.toString();
    }

    /** Matches a whole line that starts with '*' or ends with a word character. */
    private static final Pattern NON_SENTENCE_LINE_PATTERN = Pattern.compile("^(\\*.*|.*\\w)$", Pattern.MULTILINE);

    /**
     * Keeps only lines that look like sentences: every line starting with '*' or ending with a
     * word character is blanked, runs of two or more newlines are collapsed to exactly two, and
     * the result is trimmed.
     *
     * @param str the text to filter
     * @return the filtered text
     */
    public static String extractSentences(String str) {
        // the pattern is compiled once instead of on every call
        String withoutNonSentences = NON_SENTENCE_LINE_PATTERN.matcher(str).replaceAll("");
        return withoutNonSentences.replaceAll("\n{2,}", "\n\n").trim();
    }

    /**
     * Shortens an article title: first removes every parenthesized part together with the
     * whitespace character in front of it, then cuts the title at the first comma.
     *
     * @param str the title
     * @return the shortened title
     */
    public static String cleanTitle(String str) {
        String withoutParentheses = str.replaceAll("\\s\\([^)]*\\)", "");
        String upToFirstComma = withoutParentheses.replaceAll(",.*", "");
        return upToFirstComma;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Extracts the target of a redirect from the given markup.
     *
     * @param str the markup to search
     * @return group 1 of the first redirect match, or {@code null} when the markup has none
     */
    public static String getRedirect(String str) {
        Matcher redirectMatcher = REDIRECT_PATTERN.matcher(str);
        return redirectMatcher.find() ? redirectMatcher.group(1) : null;
    }

    /**
     * Fetches the latest revision of an article through the Wikipedia web API.
     *
     * @param str the article title; spaces are sent as underscores
     * @param language selects the Wikipedia edition via its ISO 639-1 code
     * @return the page, or {@code null} when the response lists no page or marks it as missing
     * @throws IllegalStateException if the HTTP request fails or the response cannot be parsed
     */
    public static final WikipediaPage retrieveArticle(String str, Language language) {
        String stringContent;
        try {
            String url = String.format("http://%s.wikipedia.org/w/api.php?action=query&prop=revisions&rvlimit=1&rvprop=content&format=json&titles=%s", language.getIso6391(), UrlHelper.encodeParameter(str.replace(" ", "_")));
            stringContent = HttpRetrieverFactory.getHttpRetriever().httpGet(url).getStringContent();
        } catch (HttpException e) {
            throw new IllegalStateException(e);
        }
        try {
            JSONObject pages = new JSONObject(stringContent).getJSONObject("query").getJSONObject("pages");
            Iterator<?> pageIds = pages.keys();
            if (!pageIds.hasNext()) {
                return null;
            }
            // only the first listed page is considered
            JSONObject page = pages.getJSONObject((String) pageIds.next());
            if (page.has("missing")) {
                return null;
            }
            // the revision content is read from the "*" key of the first revision
            String content = page.getJSONArray("revisions").getJSONObject(0).getString("*");
            return new WikipediaPage(page.getInt("pageid"), page.getInt("ns"), page.getString("title"), content);
        } catch (JSONException e) {
            throw new IllegalStateException("Error while parsing the JSON: " + e.getMessage() + ", JSON='" + stringContent + "'", e);
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Splits template markup of the form <code>{{name|key=value|value|...}}</code> into its
     * parameters. Named parameters are stored under their trimmed name; all other parts are
     * stored under their zero-based position (as a string) among the split parts.
     *
     * @param str the template markup including the surrounding double braces, must not be
     *            {@code null}
     * @return the parameters in markup order
     * @throws StringIndexOutOfBoundsException if the markup is shorter than four characters
     */
    public static Map<String, String> extractTemplate(String str) {
        Validate.notNull(str, "markup must not be null");
        Map<String, String> properties = new LinkedHashMap<String, String>();
        // strip the surrounding "{{" and "}}"
        String content = str.substring(2, str.length() - 2);
        if (str.toLowerCase().startsWith("{{geobox")) {
            // for a geobox, additionally drop everything up to and including the first '|'
            content = str.substring(str.indexOf('|') + 1, str.length() - 2);
        }
        int position = 0;
        for (String part : splitTemplateMarkup(content)) {
            String key = String.valueOf(position++);
            int equalsIdx = part.indexOf('=');
            if (equalsIdx > 0) {
                String candidateKey = part.substring(0, equalsIdx);
                if (isBracketBalanced(candidateKey) && isTagBalanced(candidateKey)) {
                    key = candidateKey.trim();
                } else {
                    // the '=' sits inside a nested construct: treat the whole part as the value
                    equalsIdx = -1;
                }
            }
            properties.put(key, part.substring(equalsIdx + 1).trim());
        }
        return properties;
    }

    /**
     * Splits template content at every '|' that is not nested inside brackets, braces or tags.
     * Everything up to and including the first '|' is skipped; when there is no '|' the whole
     * string is returned as the only part.
     *
     * @param str the template content without the surrounding double braces
     * @return the parts in order, always containing at least one element
     */
    static final List<String> splitTemplateMarkup(String str) {
        List<String> parts = CollectionHelper.newArrayList();
        int partStart = str.indexOf('|') + 1;
        for (int idx = partStart; idx < str.length(); idx++) {
            // build the prefix only at a '|' instead of once per character
            if (str.charAt(idx) == '|' && isBracketBalanced(str.substring(0, idx))) {
                parts.add(str.substring(partStart, idx));
                partStart = idx + 1;
            }
        }
        parts.add(str.substring(partStart));
        return parts;
    }

    /**
     * Heuristically checks whether opening and closing delimiters are balanced. The string
     * counts as balanced when removing all "{{", "[" and "<" shortens it by the same amount as
     * removing all "}}", "]" and ">". Delimiter kinds are not matched against each other.
     *
     * @param str the text to check
     * @return {@code true} if both removals yield strings of equal length
     */
    private static final boolean isBracketBalanced(String str) {
        int lengthWithoutOpening = str.replace("{{", "").replace("[", "").replace("<", "").length();
        // plain "]" literal instead of an unrelated configuration-library constant
        int lengthWithoutClosing = str.replace("}}", "").replace("]", "").replace(">", "").length();
        return lengthWithoutOpening == lengthWithoutClosing;
    }

    /**
     * Checks whether the text contains as many opening tags as closing tags.
     *
     * @param str the text to check
     * @return {@code true} if both tag patterns match equally often
     */
    private static final boolean isTagBalanced(String str) {
        int openingTags = StringHelper.countRegexMatches(str, OPEN_TAG_PATTERN);
        int closingTags = StringHelper.countRegexMatches(str, CLOSE_TAG_PATTERN);
        return openingTags == closingTags;
    }

    /**
     * Extracts all coordinate tags matched by {@code COORDINATE_TAG_PATTERN} from the text.
     *
     * @param str the text to search, must not be {@code null}
     * @return one location per matched tag, in order of appearance; empty if none matched
     */
    public static List<MarkupLocation> extractCoordinateTag(String str) {
        Validate.notNull(str, "text must not be null");
        List<MarkupLocation> locations = CollectionHelper.newArrayList();
        Matcher tagMatcher = COORDINATE_TAG_PATTERN.matcher(str);
        while (tagMatcher.find()) {
            MarkupLocation location = new MarkupLocation();
            location.lat = parseComponents(tagMatcher.group(1), tagMatcher.group(2), tagMatcher.group(3), tagMatcher.group(4));
            location.lng = parseComponents(tagMatcher.group(5), tagMatcher.group(6), tagMatcher.group(7), tagMatcher.group(8));

            // group 9 holds all remaining "|..." parameters of the tag
            String parameters = tagMatcher.group(9);
            String type = getCoordinateParam(parameters, "type");
            if (type != null) {
                // a type such as "city(1000)" carries a number in parentheses
                location.population = getNumberInBrackets(type);
                type = type.replaceAll("\\(.*\\)", "");
            }
            location.type = type;
            location.region = getCoordinateParam(parameters, "region");
            location.display = getOtherParam(parameters, "display");
            location.name = getOtherParam(parameters, "name");
            locations.add(location);
        }
        return locations;
    }

    /**
     * Collects coordinates from an infobox. Two sources are tried independently: separate
     * degree/minute/second/hemisphere entries under several alternative names, and the
     * "latitude"/"longitude" entries. Parsing failures are logged and otherwise ignored.
     *
     * @param wikipediaInfobox the infobox, must not be {@code null}
     * @return the coordinates found, possibly empty
     */
    public static Set<GeoCoordinate> extractCoordinatesFromInfobox(WikipediaPage.WikipediaInfobox wikipediaInfobox) {
        Validate.notNull(wikipediaInfobox, "parsedTemplate must not be null");
        Set<GeoCoordinate> coordinates = CollectionHelper.newHashSet();

        // source 1: component-wise entries
        try {
            String latDeg = wikipediaInfobox.getEntry("lat_deg", "latd", "lat_d", "lat_degrees", "source_lat_d", "mouth_lat_d");
            String lngDeg = wikipediaInfobox.getEntry("lon_deg", "longd", "long_d", "long_degrees", "source_long_d", "mouth_long_d");
            boolean degreesPresent = StringUtils.isNotBlank(latDeg) && StringUtils.isNotBlank(lngDeg);
            if (degreesPresent) {
                String latMin = wikipediaInfobox.getEntry("lat_min", "latm", "lat_m", "lat_minutes", "source_lat_m", "mouth_lat_m");
                String latSec = wikipediaInfobox.getEntry("lat_sec", "lats", "lat_s", "lat_seconds", "source_lat_s", "mouth_lat_s");
                String lngMin = wikipediaInfobox.getEntry("lon_min", "longm", "long_m", "long_minutes", "source_long_m", "mouth_long_m");
                String lngSec = wikipediaInfobox.getEntry("lon_sec", "longs", "long_s", "long_seconds", "source_long_s", "mouth_long_s");
                String latHemisphere = wikipediaInfobox.getEntry("latNS", "lat_direction", "lat_NS", "source_lat_NS", "mouth_lat_NS");
                double lat = parseComponents(latDeg, latMin, latSec, latHemisphere);
                String lngHemisphere = wikipediaInfobox.getEntry("longEW", "long_direction", "long_EW", "source_long_EW", "mouth_long_EW");
                double lng = parseComponents(lngDeg, lngMin, lngSec, lngHemisphere);
                coordinates.add(new ImmutableGeoCoordinate(Double.valueOf(lat), Double.valueOf(lng)));
            }
        } catch (Exception e) {
            LOGGER.warn("Error while parsing: {}", e.getMessage());
        }

        // source 2: combined entries; try plain decimals, then DMS notation, then a template
        String latitude = wikipediaInfobox.getEntry("latitude");
        String longitude = wikipediaInfobox.getEntry("longitude");
        if (!StringUtils.isNotBlank(latitude) || !StringUtils.isNotBlank(longitude)) {
            return coordinates;
        }
        try {
            coordinates.add(new ImmutableGeoCoordinate(Double.valueOf(latitude), Double.valueOf(longitude)));
        } catch (Exception decimalFailure) {
            try {
                coordinates.add(new ImmutableGeoCoordinate(Double.valueOf(GeoUtils.parseDms(latitude)), Double.valueOf(GeoUtils.parseDms(longitude))));
            } catch (Exception dmsFailure) {
                try {
                    coordinates.add(new ImmutableGeoCoordinate(Double.valueOf(parseDecDeg(latitude)), Double.valueOf(parseDecDeg(longitude))));
                } catch (Exception templateFailure) {
                    // the message of the first (plain decimal) failure is the one reported
                    LOGGER.warn("Error while parsing: {} and/or {}: {}", latitude, longitude, decimalFailure.getMessage());
                }
            }
        }
        return coordinates;
    }

    /** Matches a parenthesized run of digits and commas; group 1 is the content. */
    private static final Pattern NUMBER_IN_BRACKETS_PATTERN = Pattern.compile("\\(([\\d,]+)\\)");

    /**
     * Parses the first parenthesized number in the text, ignoring commas, e.g. "(1,000)".
     *
     * @param str the text to search
     * @return the number, or {@code null} if there is none or it cannot be parsed as a long
     */
    private static Long getNumberInBrackets(String str) {
        // the pattern is compiled once instead of on every call
        Matcher numberMatcher = NUMBER_IN_BRACKETS_PATTERN.matcher(str);
        if (!numberMatcher.find()) {
            return null;
        }
        String digits = numberMatcher.group(1).replace(",", "");
        try {
            return Long.valueOf(digits);
        } catch (NumberFormatException e) {
            LOGGER.error("Error parsing {}", digits);
            return null;
        }
    }

    /**
     * Looks up a {@code key=value} parameter in a '|'-separated parameter string.
     *
     * @param str the parameter string
     * @param str2 the key to look for; compared exactly, without trimming
     * @return the trimmed value of the first segment that splits at '=' into exactly two parts
     *         and whose first part equals the key, or {@code null} if there is none
     */
    private static String getOtherParam(String str, String str2) {
        String[] segments = str.split("\\|");
        for (int idx = 0; idx < segments.length; idx++) {
            String[] keyValue = segments[idx].split("=");
            if (keyValue.length != 2) {
                continue;
            }
            if (keyValue[0].equals(str2)) {
                return keyValue[1].trim();
            }
        }
        return null;
    }

    /**
     * Looks up a {@code key:value} parameter in a parameter string whose segments are separated
     * by '|' and whose entries within a segment are separated by '_', e.g.
     * "|type:city(1000)_region:US-MA".
     *
     * @param str the parameter string
     * @param str2 the key to look for; compared exactly, without trimming
     * @return the trimmed value of the first entry that splits at ':' into exactly two parts and
     *         whose first part equals the key, or {@code null} if there is none
     */
    private static String getCoordinateParam(String str, String str2) {
        String[] segments = str.split("\\|");
        for (int segmentIdx = 0; segmentIdx < segments.length; segmentIdx++) {
            String[] entries = segments[segmentIdx].split("_");
            for (int entryIdx = 0; entryIdx < entries.length; entryIdx++) {
                String[] keyValue = entries[entryIdx].split(":");
                if (keyValue.length != 2) {
                    continue;
                }
                if (keyValue[0].equals(str2)) {
                    return keyValue[1].trim();
                }
            }
        }
        return null;
    }

    /**
     * Combines degrees, optional minutes and seconds, and an optional hemisphere letter into
     * signed decimal degrees.
     *
     * <p>Minutes and seconds are added to the magnitude of the degrees, so a negative degree
     * value such as "-71" with minutes "30" yields -71.5 (the previous arithmetic produced
     * -70.5). The hemisphere letters "S" and "W" negate the result and are compared ignoring
     * case, because the coordinate tag pattern matches case-insensitively. For input without
     * minutes and seconds the result is unchanged.
     *
     * @param str the degrees, must not be {@code null} or empty
     * @param str2 the minutes, may be blank
     * @param str3 the seconds, may be blank
     * @param str4 the hemisphere letter, may be {@code null}
     * @return the coordinate in decimal degrees
     * @throws NumberFormatException if a given component is not a number
     */
    private static double parseComponents(String str, String str2, String str3, String str4) {
        Validate.notEmpty(str, "deg must not be null or empty");
        double degrees = Double.valueOf(str).doubleValue();
        double minutes = StringUtils.isNotBlank(str2) ? Double.valueOf(str2).doubleValue() : 0.0d;
        double seconds = StringUtils.isNotBlank(str3) ? Double.valueOf(str3).doubleValue() : 0.0d;

        int sign = ("S".equalsIgnoreCase(str4) || "W".equalsIgnoreCase(str4)) ? -1 : 1;
        // check the text rather than the value so that "-0" keeps its sign
        if (str.trim().startsWith("-")) {
            sign = -sign;
        }
        return sign * (Math.abs(degrees) + (minutes / 60.0d) + (seconds / 3600.0d));
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Collects all internal links from the markup. A fragment ("#...") is cut from the link
     * target, and links whose target starts with "category:" (ignoring case) are skipped.
     *
     * @param str the markup, must not be {@code null}
     * @return the links in order of appearance; the link text is {@code null} for links
     *         without a label
     */
    public static final List<WikipediaPage.WikipediaLink> getLinks(String str) {
        Validate.notNull(str, "markup must not be null");
        List<WikipediaPage.WikipediaLink> links = CollectionHelper.newArrayList();
        Matcher linkMatcher = INTERNAL_LINK_PATTERN.matcher(str);
        while (linkMatcher.find()) {
            String target = linkMatcher.group(1);
            String label = linkMatcher.group(2);
            int fragmentIdx = target.indexOf('#');
            if (fragmentIdx >= 0) {
                target = target.substring(0, fragmentIdx);
            }
            if (target.toLowerCase().startsWith("category:")) {
                continue;
            }
            links.add(new WikipediaPage.WikipediaLink(target, label));
        }
        return links;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Extracts all templates with one of the given names from the markup, including their
     * surrounding double braces. HTML comments are stripped before searching and names are
     * matched case-insensitively.
     *
     * <p>The end of a template is found by counting curly braces from its start. The previous
     * loop was bounded on the never-changing start index and had no exit once the braces
     * balanced, so every match ran past the end of the text and failed with an uncaught
     * {@link StringIndexOutOfBoundsException}. Templates whose braces never balance are now
     * logged and skipped.
     *
     * @param str the markup to search
     * @param strArr the template names; they are inserted into a regular expression unquoted
     * @return the matching templates in order of appearance
     */
    public static List<String> getNamedMarkup(String str, String... strArr) {
        List<String> result = CollectionHelper.newArrayList();
        String cleanMarkup = HtmlHelper.stripHtmlTags(str, HtmlElement.COMMENTS);
        Pattern namePattern = Pattern.compile("\\{\\{(?:" + StringUtils.join(strArr, "|").toLowerCase() + ")(?:\\s|\\|)", Pattern.CASE_INSENSITIVE);
        Matcher nameMatcher = namePattern.matcher(cleanMarkup);
        while (nameMatcher.find()) {
            int startIdx = nameMatcher.start();
            int endIdx = -1;
            int depth = 0;
            for (int idx = startIdx; idx < cleanMarkup.length(); idx++) {
                char current = cleanMarkup.charAt(idx);
                if (current == '{') {
                    depth++;
                } else if (current == '}') {
                    depth--;
                    if (depth == 0) {
                        // the brace that closes the template opened at startIdx
                        endIdx = idx;
                        break;
                    }
                }
            }
            if (endIdx < 0) {
                LOGGER.warn("Encountered unbalanced braces starting at index {}, potentially caused by invalid markup.", startIdx);
                continue;
            }
            result.add(cleanMarkup.substring(startIdx, endIdx + 1));
        }
        return result;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Splits the markup into sections at every heading line. The first element is the text
     * before the first heading (possibly empty); every further element starts with its heading.
     *
     * @param str the markup, must not be {@code null}
     * @return the sections in order, always containing at least one element
     */
    public static List<String> getSections(String str) {
        Validate.notNull(str, "markup must not be null");
        List<String> sections = CollectionHelper.newArrayList();
        Matcher headingMatcher = HEADING_PATTERN.matcher(str);
        int sectionStart = 0;
        while (headingMatcher.find()) {
            int headingStart = headingMatcher.start();
            sections.add(str.substring(sectionStart, headingStart));
            sectionStart = headingStart;
        }
        // whatever follows the last heading (or the whole text when there is no heading)
        sections.add(str.substring(sectionStart));
        return sections;
    }

    /**
     * Parses a coordinate given as template markup into signed decimal degrees. The template is
     * split with {@code extractTemplate}; degrees, minutes, seconds, hemisphere and rounding
     * precision are looked up under the keys "deg"/"0", "min"/"1", "sec"/"2", "hem"/"3" and
     * "rnd"/"4". The sign comes from the hemisphere ("W" or "S" is negative) when one is given,
     * otherwise from a leading '-' of the degrees.
     *
     * <p>The template is now split inside the guarded section, so input that is too short to be
     * a template is reported as {@link NumberFormatException} like every other parsing failure
     * instead of leaking a {@link StringIndexOutOfBoundsException}.
     *
     * @param str the template markup, must not be {@code null}
     * @return the coordinate in decimal degrees
     * @throws NumberFormatException if the markup cannot be parsed; the original failure is
     *         attached as the cause
     */
    public static double parseDecDeg(String str) {
        Validate.notNull(str, "string must not be null");
        try {
            Map<String, String> template = extractTemplate(str);
            String degText = (String) CollectionHelper.getTrying(template, "deg", "0");
            String minText = (String) CollectionHelper.getTrying(template, "min", "1");
            String secText = (String) CollectionHelper.getTrying(template, "sec", "2");
            String hemisphere = (String) CollectionHelper.getTrying(template, "hem", "3");

            double degrees = StringUtils.isNotBlank(degText) ? Double.valueOf(degText).doubleValue() : 0.0d;
            double minutes = StringUtils.isNotBlank(minText) ? Double.valueOf(minText).doubleValue() : 0.0d;
            double seconds = StringUtils.isNotBlank(secText) ? Double.valueOf(secText).doubleValue() : 0.0d;

            int sign;
            if (StringUtils.isNotBlank(hemisphere)) {
                sign = ("W".equals(hemisphere) || "S".equals(hemisphere)) ? -1 : 1;
            } else {
                sign = degText.startsWith("-") ? -1 : 1;
            }
            double result = sign * (Math.abs(degrees) + (minutes / 60.0d) + (seconds / 3600.0d));

            String precision = (String) CollectionHelper.getTrying(template, "rnd", "4");
            if (StringUtils.isNotBlank(precision)) {
                result = MathHelper.round(result, Integer.valueOf(precision).intValue());
            }
            return result;
        } catch (Exception e) {
            NumberFormatException failure = new NumberFormatException("The coordinate data from \"" + str + "\" could not be parsed.");
            // NumberFormatException has no cause constructor
            failure.initCause(e);
            throw failure;
        }
    }

    // Private constructor: the class is final and offers only static members, so it is never instantiated.
    private WikipediaUtil() {
    }

    /**
     * Demo entry point: retrieves the English "Charles River" article, prints the parameters of
     * its first geobox template (if any) and then the page itself.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        WikipediaPage page = retrieveArticle("Charles River", Language.ENGLISH);
        // retrieveArticle returns null for missing articles
        if (page == null) {
            System.out.println("Article not found.");
            return;
        }
        List<String> geoboxes = getNamedMarkup(page.getText(), "geobox");
        if (geoboxes.isEmpty()) {
            System.out.println("No geobox found.");
        } else {
            CollectionHelper.print(extractTemplate(geoboxes.get(0)));
        }
        System.out.println(page);
    }
}
