package ws.palladian.retrieval.wikipedia;

import com.aliasi.xml.XHtmlWriter;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.util.concurrent.TimeUnit;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import ws.palladian.helper.StopWatch;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/wikipedia/WikipediaPageContentHandler.class */
/**
 * SAX handler which extracts pages from a Wikipedia XML dump (such as the
 * {@code pages-articles} file used by {@link #main(String[])}) and passes each completed page to
 * a {@link WikipediaPageCallback}.
 *
 * <p>
 * For every {@code <page>} element the handler collects the character content of
 * {@code <title>}, {@code <ns>}, {@code <text>} and the {@code <id>} element which is <em>not</em>
 * nested inside a {@code <revision>}. When the closing {@code </page>} tag is reached, a
 * {@link WikipediaPage} is created from the collected values and handed to the callback. Every
 * 1000 pages a progress message with the current throughput is logged.
 *
 * <p>
 * Instances keep mutable parsing state and are therefore intended for a single parse run.
 */
public class WikipediaPageContentHandler extends DefaultHandler {
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaPageContentHandler.class);

    /** Names of the dump elements this handler reacts to. */
    private static final String ELEMENT_PAGE = "page";
    private static final String ELEMENT_REVISION = "revision";
    private static final String ELEMENT_ID = "id";
    private static final String ELEMENT_TITLE = "title";
    private static final String ELEMENT_NAMESPACE = "ns";
    private static final String ELEMENT_TEXT = "text";

    /** A progress message is logged each time this many pages have been processed. */
    private static final int LOG_INTERVAL = 1000;

    /** Dump file read by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_DUMP_PATH = "/Users/pk/Downloads/enwiki-latest-pages-articles.xml.bz2";

    /** ID of the page which {@link #main(String[])} looks for and prints. */
    private static final int DEMO_PAGE_ID = 27394805;

    /** Number of pages passed to the callback so far. */
    private int pageCounter;

    /** Measures the time since this handler was created; used for the throughput log message. */
    private final StopWatch stopWatch;

    private final WikipediaPageCallback callback;

    /** Accumulates the character content of the element currently being captured. */
    private final StringBuilder buffer = new StringBuilder();

    /** {@code true} while character data should be appended to {@link #buffer}. */
    private boolean bufferText = false;

    /** {@code true} while the parser is inside a {@code <revision>} element. */
    private boolean inRevision = false;

    // Values of the page which is currently being parsed.
    private String title;
    private int pageId;
    private int namespaceId;
    private String text;

    /**
     * Creates a new handler.
     *
     * @param callback The callback which receives every parsed page, not <code>null</code>.
     * @throws NullPointerException In case the callback is <code>null</code>.
     */
    public WikipediaPageContentHandler(WikipediaPageCallback callback) {
        Validate.notNull(callback, "callback must not be null");
        this.callback = callback;
        this.stopWatch = new StopWatch();
    }

    /**
     * Starts capturing character data for the elements of interest and tracks whether the parser
     * is inside a {@code <revision>}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if (ELEMENT_REVISION.equals(qName)) {
            inRevision = true;
            return;
        }
        if (isCapturedElement(qName)) {
            // make sure no leftovers from a previous element leak into this value
            buffer.setLength(0);
            bufferText = true;
        }
    }

    /**
     * Stores the captured value for the element which just ended, and emits the page when
     * {@code </page>} is reached.
     *
     * @throws NumberFormatException In case the content of a page {@code <id>} or {@code <ns>}
     *             element is not an integer.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (ELEMENT_REVISION.equals(qName)) {
            inRevision = false;
        } else if (ELEMENT_ID.equals(qName)) {
            // IDs nested in a revision are not captured, only the one directly describing the page
            if (!inRevision) {
                pageId = Integer.parseInt(getBuffer().trim());
            }
        } else if (ELEMENT_TEXT.equals(qName)) {
            text = getBuffer();
        } else if (ELEMENT_TITLE.equals(qName)) {
            title = getBuffer();
        } else if (ELEMENT_NAMESPACE.equals(qName)) {
            namespaceId = Integer.parseInt(getBuffer().trim());
        } else if (ELEMENT_PAGE.equals(qName)) {
            processPage();
        }
    }

    /** Appends character data to the buffer while an element of interest is being captured. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (bufferText) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return <code>true</code> in case the character content of the given element needs to be
     *         captured in the current parsing state.
     */
    private boolean isCapturedElement(String qName) {
        if (ELEMENT_TEXT.equals(qName) || ELEMENT_TITLE.equals(qName) || ELEMENT_NAMESPACE.equals(qName)) {
            return true;
        }
        return ELEMENT_ID.equals(qName) && !inRevision;
    }

    /**
     * Counts the completed page, logs the throughput every {@value #LOG_INTERVAL} pages and
     * passes the page to the callback.
     */
    private void processPage() {
        pageCounter++;
        if (pageCounter % LOG_INTERVAL == 0) {
            // Calculate on millisecond basis; truncating to full seconds would lead to a division
            // by zero during the first second and to imprecise values afterwards. The elapsed time
            // is clamped to at least one millisecond for the same reason.
            long elapsedMillis = Math.max(1, stopWatch.getElapsedTime());
            long pagesPerSecond = Math.round(pageCounter * 1000.0 / elapsedMillis);
            LOGGER.info("Processed {} pages, throughput {} pages/second.", pageCounter, pagesPerSecond);
        }
        callback.callback(new WikipediaPage(pageId, namespaceId, title, text));
    }

    /**
     * Returns the captured character data, empties the buffer and stops capturing.
     *
     * @return The content collected since capturing was started.
     */
    private String getBuffer() {
        String content = buffer.toString();
        buffer.setLength(0);
        bufferText = false;
        return content;
    }

    /**
     * Small demo: parses a bzip2-compressed dump, prints the page with ID {@value #DEMO_PAGE_ID}
     * and terminates the JVM as soon as that page was found.
     *
     * @param strArr Optionally the path to the dump file as first element; if missing,
     *            {@value #DEFAULT_DUMP_PATH} is used.
     * @throws Exception In case reading or parsing the dump fails.
     */
    public static void main(String[] strArr) throws Exception {
        String dumpPath = strArr != null && strArr.length > 0 ? strArr[0] : DEFAULT_DUMP_PATH;
        // try-with-resources, so that the file handle is released also when parsing fails
        try (BufferedInputStream fileStream = new BufferedInputStream(new FileInputStream(new File(dumpPath)))) {
            WikipediaPageContentHandler handler = new WikipediaPageContentHandler(new WikipediaPageCallback() {
                @Override
                public void callback(WikipediaPage wikipediaPage) {
                    if (wikipediaPage.getPageId() == DEMO_PAGE_ID) {
                        System.out.println(wikipediaPage);
                        // the page was found, no need to read the remainder of the dump
                        System.exit(0);
                    }
                }
            });
            SAXParserFactory.newInstance().newSAXParser().parse(new MultiStreamBZip2InputStream(fileStream), handler);
        }
    }
}
