package ws.palladian.extraction.location.sources.importers;

import edu.stanford.nlp.ling.CoreLabel;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import ws.palladian.extraction.location.AlternativeName;
import ws.palladian.extraction.location.GeoCoordinate;
import ws.palladian.extraction.location.ImmutableLocation;
import ws.palladian.extraction.location.LocationType;
import ws.palladian.extraction.location.persistence.LocationDatabase;
import ws.palladian.extraction.location.sources.LocationStore;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;
import ws.palladian.persistence.DatabaseManagerFactory;
import ws.palladian.retrieval.wikipedia.MultiStreamBZip2InputStream;
import ws.palladian.retrieval.wikipedia.WikipediaPage;
import ws.palladian.retrieval.wikipedia.WikipediaPageCallback;
import ws.palladian.retrieval.wikipedia.WikipediaPageContentHandler;
import ws.palladian.retrieval.wikipedia.WikipediaUtil;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/location/sources/importers/WikipediaLocationImporter.class */
public class WikipediaLocationImporter {
    /** Logger for progress and diagnostic output of the import. */
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaLocationImporter.class);

    /** Page titles matching this pattern ("Geography of ..." / "Battle of ...") are skipped during the location import. */
    private static final Pattern IGNORED_PAGES = Pattern.compile("(?:Geography|Battle) of .*");

    /** Maps an infobox name to the location type it denotes; populated once by {@link #loadMapping()}. */
    private static final Map<String, LocationType> INFOBOX_MAPPING = loadMapping();

    /** Destination for extracted locations and their alternative names. */
    private final LocationStore locationStore;

    /**
     * Page title to raw Wikipedia page ID (without {@link #idOffset}) for every location saved so far; filled while
     * importing location pages and consulted when resolving redirects.
     */
    private final Map<String, Integer> locationNamesIds;

    /** Factory for the SAX parsers used to read the dump. */
    private final SAXParserFactory saxParserFactory;

    /** Value added to each Wikipedia page ID to form the ID under which the location is stored. */
    private final int idOffset;

    /** The alternative name extraction strategies enabled for this importer. */
    private final Set<AlternativeNameExtraction> nameExtraction;

    /**
     * Strategies for collecting alternative names of a location; any combination can be passed to the importer's
     * constructor.
     */
    public enum AlternativeNameExtraction {
        /** Use the titles of redirect pages pointing to an imported location (requires a second pass over the dump). */
        REDIRECTS,
        /** Use the strings set in bold within the first section of the location's own page. */
        PAGE
    }

    /**
     * Loads the infobox-name to location-type mapping from the classpath resource
     * {@code /wikipediaLocationInfoboxMappings.csv}.
     * <p>
     * The resource is read line by line. Empty lines and lines starting with {@code #} are skipped; every other line
     * must consist of at least two tab-separated columns: the infobox name and a type name which is resolved through
     * {@link LocationType#map(String)}.
     *
     * @return The mapping from infobox name to location type.
     * @throws IllegalStateException If the resource cannot be found or contains a line without a tab-separated type.
     */
    private static Map<String, LocationType> loadMapping() {
        final Map<String, LocationType> mapping = new HashMap<String, LocationType>();
        InputStream inputStream = WikipediaLocationImporter.class.getResourceAsStream("/wikipediaLocationInfoboxMappings.csv");
        if (inputStream == null) {
            // fail fast with a clear message instead of handing a null stream to the line reader
            throw new IllegalStateException("Resource /wikipediaLocationInfoboxMappings.csv not found on classpath");
        }
        try {
            FileHelper.performActionOnEveryLine(inputStream, new LineAction() {
                @Override
                public void performAction(String line, int lineNumber) {
                    if (line.isEmpty() || line.startsWith("#")) {
                        return;
                    }
                    String[] columns = line.split("\\t");
                    if (columns.length < 2) {
                        // previously surfaced as a bare ArrayIndexOutOfBoundsException without any context
                        throw new IllegalStateException("Malformed infobox mapping line (expected <name><TAB><type>): '" + line + "'");
                    }
                    mapping.put(columns[0], LocationType.map(columns[1]));
                }
            });
            return mapping;
        } finally {
            FileHelper.close(inputStream);
        }
    }

    /**
     * Creates a new importer writing into the given store.
     *
     * @param locationStore The store which receives the extracted locations, not <code>null</code>.
     * @param i The ID offset; it is added to each Wikipedia page ID to form the ID of the stored location. Must be
     *            greater than or equal to zero.
     * @param alternativeNameExtractionArr The strategies to use for extracting alternative names; may be empty, in
     *            which case no alternative names are extracted.
     * @throws NullPointerException If the store or the strategies array is <code>null</code>.
     * @throws IllegalArgumentException If the offset is negative.
     */
    public WikipediaLocationImporter(LocationStore locationStore, int i, AlternativeNameExtraction... alternativeNameExtractionArr) {
        Validate.notNull(locationStore, "locationStore must not be null");
        Validate.isTrue(i >= 0, "idOffset must be greater than or equal to zero, but was %d", i);
        Validate.notNull(alternativeNameExtractionArr, "nameExtraction must not be null");
        this.locationStore = locationStore;
        this.idOffset = i;
        this.saxParserFactory = SAXParserFactory.newInstance();
        this.locationNamesIds = CollectionHelper.newHashMap();
        this.nameExtraction = new HashSet<AlternativeNameExtraction>(Arrays.asList(alternativeNameExtractionArr));
    }

    /**
     * Imports locations from a bzip2-compressed Wikipedia XML dump.
     * <p>
     * The dump is read once to import the location pages. If {@link AlternativeNameExtraction#REDIRECTS} is enabled,
     * it is read a second time to import alternative names from redirect pages, because the redirects can only be
     * resolved once all locations are known.
     *
     * @param file The dump file, must exist and have the extension <code>.bz2</code>, not <code>null</code>.
     * @throws IllegalArgumentException If the file does not exist, is no regular file, or is no <code>.bz2</code>
     *             file.
     * @throws IllegalStateException If reading or parsing the dump fails; the original exception is the cause.
     */
    public void importDumpBz2(File file) {
        Validate.notNull(file, "dumpXml must not be null");
        if (!file.isFile()) {
            throw new IllegalArgumentException("At least one of the given dump paths does not exist or is no file");
        }
        if (!file.getName().endsWith(".bz2")) {
            throw new IllegalArgumentException("XML dump file must be of type .bz2");
        }
        StopWatch stopWatch = new StopWatch();
        // declared outside the try so that both streams are reachable from the finally block
        MultiStreamBZip2InputStream locationStream = null;
        MultiStreamBZip2InputStream redirectStream = null;
        try {
            locationStream = new MultiStreamBZip2InputStream(new BufferedInputStream(new FileInputStream(file)));
            LOGGER.info("Reading location data from {}", file);
            importLocationPages(locationStream);
            if (this.nameExtraction.contains(AlternativeNameExtraction.REDIRECTS)) {
                redirectStream = new MultiStreamBZip2InputStream(new BufferedInputStream(new FileInputStream(file)));
                LOGGER.info("Reading location alternative names from redirects in {}", file);
                importAlternativeNames(redirectStream);
            } else {
                LOGGER.info("Skip reading location alternative names from redirects.");
            }
        } catch (IOException e) {
            // also covers FileNotFoundException
            throw new IllegalStateException(e);
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        } catch (SAXException e) {
            throw new IllegalStateException(e);
        } finally {
            // close whatever has been opened, on success as well as on failure
            FileHelper.close(locationStream, redirectStream);
        }
        LOGGER.info("Finished import in {}", stopWatch);
    }

    /**
     * Parses the given dump stream and saves every page which qualifies as location to the location store.
     * <p>
     * A page qualifies if it is a non-redirect page in namespace zero, its title is not matched by
     * {@link #IGNORED_PAGES}, one of its infoboxes is contained in {@link #INFOBOX_MAPPING}, and a coordinate can be
     * determined, either from a coordinate tag displayed in the title, or from one of the infoboxes. When the
     * {@link AlternativeNameExtraction#PAGE} strategy is enabled, the bold strings from the page's first section are
     * stored as alternative names.
     *
     * @param inputStream The stream delivering the uncompressed dump XML.
     * @throws ParserConfigurationException In case the SAX parser could not be created.
     * @throws SAXException In case parsing fails.
     * @throws IOException In case reading from the stream fails.
     */
    void importLocationPages(InputStream inputStream) throws ParserConfigurationException, SAXException, IOException {
        // single-element array, so that the anonymous callback can modify the counter
        final int[] savedLocations = new int[1];
        this.saxParserFactory.newSAXParser().parse(inputStream, new WikipediaPageContentHandler(new WikipediaPageCallback() {
            @Override
            public void callback(WikipediaPage wikipediaPage) {
                if (wikipediaPage.getNamespaceId() != 0 || wikipediaPage.isRedirect()) {
                    return;
                }
                if (IGNORED_PAGES.matcher(wikipediaPage.getTitle()).matches()) {
                    LOGGER.debug("Ignoring '{}' by blacklist", wikipediaPage.getTitle());
                    return;
                }
                String markup = wikipediaPage.getText();
                List<WikipediaPage.WikipediaInfobox> infoboxes = wikipediaPage.getInfoboxes();
                if (infoboxes.isEmpty()) {
                    LOGGER.debug("Page '{}' has no infobox; skip", wikipediaPage.getTitle());
                    return;
                }

                // the first infobox with a known mapping determines the type
                LocationType type = null;
                for (WikipediaPage.WikipediaInfobox infobox : infoboxes) {
                    type = INFOBOX_MAPPING.get(infobox.getName());
                    if (type != null) {
                        break;
                    }
                }
                if (type == null) {
                    LOGGER.debug("Unmapped type for '{}'; ignore", wikipediaPage.getTitle());
                    return;
                }

                // prefer coordinate tags which are displayed in the title; in case of several, the last one wins
                GeoCoordinate coordinate = null;
                Long population = null;
                for (WikipediaUtil.MarkupLocation candidate : WikipediaUtil.extractCoordinateTag(markup)) {
                    String display = candidate.getDisplay();
                    if (display == null) {
                        continue;
                    }
                    if (display.contains("title") || display.equals("t")) {
                        coordinate = candidate;
                        population = candidate.getPopulation();
                    }
                }

                // fallback: coordinates from the infoboxes; no early exit, so the last infobox with coordinates wins
                if (coordinate == null) {
                    for (WikipediaPage.WikipediaInfobox infobox : infoboxes) {
                        Set<GeoCoordinate> infoboxCoordinates = WikipediaUtil.extractCoordinatesFromInfobox(infobox);
                        if (!infoboxCoordinates.isEmpty()) {
                            coordinate = (GeoCoordinate) CollectionHelper.getFirst(infoboxCoordinates);
                        }
                    }
                }
                if (coordinate == null) {
                    return;
                }

                String name = wikipediaPage.getCleanTitle();
                int locationId = wikipediaPage.getPageId() + idOffset;
                locationStore.save(new ImmutableLocation(locationId, name, type, coordinate.getLatitude(), coordinate.getLongitude(), population));
                LOGGER.trace("Saved location with ID {}, name {}", wikipediaPage.getPageId(), name);
                // remember the raw page ID (without offset) by title, this is needed for resolving the redirects
                locationNamesIds.put(wikipediaPage.getTitle(), wikipediaPage.getPageId());
                savedLocations[0]++;

                if (!nameExtraction.contains(AlternativeNameExtraction.PAGE)) {
                    return;
                }
                List<String> sections = wikipediaPage.getSections();
                if (sections.isEmpty()) {
                    return;
                }
                Set<AlternativeName> alternativeNames = CollectionHelper.newHashSet();
                for (String boldString : getStringsInBold(sections.get(0))) {
                    if (!boldString.equals(name)) {
                        alternativeNames.add(new AlternativeName(boldString));
                    }
                }
                locationStore.addAlternativeNames(locationId, alternativeNames);
                LOGGER.debug("Extracted {} alternative names from page", alternativeNames.size());
            }
        }));
        LOGGER.info("Finished importing {} locations", savedLocations[0]);
    }

    /**
     * Parses the given dump stream and stores the titles of redirect pages as alternative names for those locations
     * which were saved by a preceding run of {@link #importLocationPages(InputStream)}.
     * <p>
     * Only redirects in namespace zero are considered. Redirects to an anchor (target contains <code>#</code>),
     * redirects to pages which have not been imported as location, and redirects whose title starts with the target
     * title followed by the separator are skipped.
     *
     * @param inputStream The stream delivering the uncompressed dump XML.
     * @throws ParserConfigurationException In case the SAX parser could not be created.
     * @throws SAXException In case parsing fails.
     * @throws IOException In case reading from the stream fails.
     */
    void importAlternativeNames(InputStream inputStream) throws ParserConfigurationException, SAXException, IOException {
        // single-element array, so that the anonymous callback can modify the counter
        final int[] savedNames = new int[1];
        this.saxParserFactory.newSAXParser().parse(inputStream, new WikipediaPageContentHandler(new WikipediaPageCallback() {
            @Override
            public void callback(WikipediaPage wikipediaPage) {
                boolean articleRedirect = wikipediaPage.getNamespaceId() == 0 && wikipediaPage.isRedirect();
                if (!articleRedirect) {
                    return;
                }
                String target = wikipediaPage.getRedirectTitle();
                if (target.contains("#")) {
                    LOGGER.debug("Skip anchor redirect '{}'", target);
                    return;
                }
                Integer targetPageId = locationNamesIds.get(target);
                if (targetPageId == null) {
                    // the redirect target has not been imported as location
                    return;
                }
                String alias = wikipediaPage.getCleanTitle();
                // NOTE(review): the separator constant stems from an unrelated NLP library; presumably the decompiler
                // substituted it for a string literal of equal value -- confirm and replace by the literal
                if (alias.startsWith(target + CoreLabel.TAG_SEPARATOR)) {
                    LOGGER.debug("Skip redirect from '{}' to '{}'", alias, target);
                    return;
                }
                // the map holds raw page IDs, the store expects them shifted by the offset
                int locationId = targetPageId.intValue() + idOffset;
                locationStore.addAlternativeNames(locationId, Collections.singleton(new AlternativeName(alias)));
                LOGGER.debug("Save alternative name {} for location with ID {}", alias, targetPageId);
                savedNames[0]++;
            }
        }));
        LOGGER.info("Finished importing {} alternative names", savedNames[0]);
    }

    /** Matches text enclosed in three apostrophes on each side; group one captures the enclosed text. */
    private static final Pattern BOLD_MARKUP = Pattern.compile("'''([^']+)'''");

    /**
     * Extracts all strings which are enclosed in triple apostrophes (<code>'''like this'''</code>) from the given
     * text. The enclosed text itself must not contain an apostrophe.
     *
     * @param str The text to search, may be <code>null</code>.
     * @return A new, modifiable list with the enclosed strings in order of occurrence, or an empty list if nothing
     *         was found or the given text was <code>null</code>.
     */
    public static final List<String> getStringsInBold(String str) {
        List<String> boldStrings = new ArrayList<String>();
        if (str == null) {
            return boldStrings;
        }
        Matcher matcher = BOLD_MARKUP.matcher(str);
        while (matcher.find()) {
            boldStrings.add(matcher.group(1));
        }
        return boldStrings;
    }

    /**
     * Command line entry point: imports a Wikipedia dump into the <code>locations2</code> database, extracting
     * alternative names from the pages.
     * <p>
     * <b>Attention:</b> the database is truncated before the import. To avoid wiping it for nothing, the dump file is
     * checked before truncating.
     *
     * @param strArr Optionally the path to the <code>.bz2</code> dump as first element; if missing, the built-in
     *            default path is used.
     * @throws IllegalArgumentException If the dump file does not exist or is no regular file.
     * @throws Exception In case the import fails.
     */
    public static void main(String[] strArr) throws Exception {
        String dumpPath = "/Users/pk/Downloads/enwiki-latest-pages-articles.xml.bz2";
        if (strArr != null && strArr.length > 0) {
            dumpPath = strArr[0];
        }
        File dumpFile = new File(dumpPath);
        if (!dumpFile.isFile()) {
            // check before truncating, so that a wrong path does not cost the existing data
            throw new IllegalArgumentException("Dump file " + dumpFile + " does not exist or is no file");
        }
        LocationDatabase locationDatabase = (LocationDatabase) DatabaseManagerFactory.create(LocationDatabase.class, "locations2");
        locationDatabase.truncate();
        new WikipediaLocationImporter(locationDatabase, 100000000, AlternativeNameExtraction.PAGE).importDumpBz2(dumpFile);
    }
}
