package ws.palladian.retrieval.analysis;

import com.aliasi.xml.XHtmlWriter;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.classification.utils.ClassificationUtils;
import ws.palladian.helper.ProgressHelper;
import ws.palladian.helper.StopWatch;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.constants.SizeUnit;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.nlp.StringHelper;
import ws.palladian.retrieval.DocumentRetriever;
import ws.palladian.retrieval.HttpResult;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.RetrieverCallback;
import ws.palladian.retrieval.ranking.RankingServiceException;
import ws.palladian.retrieval.ranking.services.SemRush;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/analysis/SitemapAnalyzer.class */
/**
 * Downloads every page listed in a sitemap, collects a few per-page statistics (link counts, word count,
 * size, external backlinks as reported by {@link SemRush}) and appends them as semicolon-separated rows to
 * a result file.
 *
 * <p>Results are accumulated in instance fields, so calling {@link #analyzeSitemap(String, String)} more
 * than once on the same instance adds to the data of earlier runs.</p>
 */
public class SitemapAnalyzer {
    private static final Logger LOGGER = LoggerFactory.getLogger(SitemapAnalyzer.class);

    /** Keys of the per-page values, in the order in which they are written after the page URL. */
    private static final String[] RESULT_COLUMNS = {
            "accessible", "in-int", "out-int", "in-ext", "out-ext", "#words", XHtmlWriter.SIZE
    };

    /** Number of threads handed to the {@link DocumentRetriever}. */
    private int numThreads = 10;

    /** Page URL to collected values; filled concurrently by the retrieval callback. */
    private final ConcurrentHashMap<String, Map<String, Object>> resultTable = new ConcurrentHashMap<>();

    /** Counts how often a URL was linked from the analyzed pages; guarded by synchronizing on the map itself. */
    private final CountMap<String> internalInboundLinkMap = CountMap.create();

    /**
     * @return the number of threads used for downloading the pages
     */
    public int getNumThreads() {
        return this.numThreads;
    }

    /**
     * @param i the number of threads to use for downloading the pages
     */
    public void setNumThreads(int i) {
        this.numThreads = i;
    }

    /**
     * Retrieves all URLs from the given sitemap, analyzes each page and appends the result table to a file.
     *
     * @param str  URL of the sitemap (or sitemap index) to read the page URLs from
     * @param str2 path of the result file; it is opened in append mode, so a header line and the rows are
     *             added to any existing content
     */
    public void analyzeSitemap(String str, String str2) {
        final StopWatch stopWatch = new StopWatch();
        LOGGER.info("getting the page urls");
        List<String> urls = new SitemapRetriever().getUrls(str);
        final int totalPages = urls.size();
        final AtomicInteger progressCounter = new AtomicInteger(1);

        RetrieverCallback<Document> callback = new RetrieverCallback<Document>() {
            @Override
            public void onFinishRetrieval(Document document) {
                analyzeDocument(document);
                // Read and advance the counter in one atomic step so that concurrent callbacks
                // never report the same progress value.
                ProgressHelper.printProgress(progressCounter.getAndIncrement(), totalPages, 0.2d, stopWatch);
            }
        };

        LOGGER.info("starting to process each page (" + totalPages + " in total), time elapsed: "
                + stopWatch.getElapsedTimeString());
        HttpRetriever httpRetriever = HttpRetrieverFactory.getHttpRetriever();
        long timeoutMillis = TimeUnit.SECONDS.toMillis(120L);
        httpRetriever.setConnectionTimeout(timeoutMillis);
        httpRetriever.setSocketTimeout(timeoutMillis);
        DocumentRetriever documentRetriever = new DocumentRetriever(httpRetriever);
        documentRetriever.setNumThreads(getNumThreads());
        // NOTE(review): the code below reads the collected results right away, so this call is
        // presumably blocking until all callbacks have run - confirm against DocumentRetriever.
        documentRetriever.getWebDocuments(urls, callback);

        LOGGER.info("gathering all internal inbound link information, time elapsed: "
                + stopWatch.getElapsedTimeString());
        for (String url : urls) {
            Map<String, Object> pageValues = this.resultTable.get(url);
            if (pageValues != null) {
                pageValues.put("in-int", Integer.valueOf(this.internalInboundLinkMap.getCount(url)));
            }
        }

        LOGGER.info("saving the result table, time elapsed: " + stopWatch.getElapsedTimeString());
        writeResultTable(str2);
    }

    /**
     * Collects the statistics for a single retrieved page and stores them in {@link #resultTable} under
     * the document's URI. Also registers the page's outgoing links in {@link #internalInboundLinkMap}.
     *
     * @param document the retrieved page
     */
    private void analyzeDocument(Document document) {
        Map<String, Object> pageValues = new HashMap<>();
        String documentUri = document.getDocumentURI();

        // The two flag combinations are stored as "out-int" and "out-ext" below, i.e. they are
        // treated as the page's internal and external outgoing links respectively.
        Set<String> internalLinks = HtmlHelper.getLinks(document, true, false);
        Set<String> externalLinks = HtmlHelper.getLinks(document, false, true);

        synchronized (this.internalInboundLinkMap) {
            for (String link : internalLinks) {
                // A page linking to itself does not count as an inbound link.
                if (!link.equalsIgnoreCase(documentUri)) {
                    this.internalInboundLinkMap.add(link);
                }
            }
        }

        try {
            HttpResult httpResult = (HttpResult) document.getUserData(DocumentRetriever.HTTP_RESULT_KEY);
            pageValues.put("accessible", Boolean.valueOf(httpResult.getStatusCode() < 400));
        } catch (Exception e) {
            // Best effort: without an attached HTTP result the value simply stays unset
            // (written as "null"), but leave a trace of the reason instead of hiding it.
            LOGGER.debug("Could not determine accessibility of {}", documentUri, e);
        }

        String innerXml = HtmlHelper.getInnerXml(document);
        int wordCount = StringHelper.countWords(HtmlHelper.stripHtmlTags(innerXml));

        Float externalInboundLinks = null;
        try {
            externalInboundLinks = new SemRush().getRanking(documentUri).getValues().get(SemRush.BACKLINKS_PAGE);
        } catch (RankingServiceException e) {
            LOGGER.error("Error retrieving ranking: {}", e.getMessage(), e);
        }

        pageValues.put("in-ext", externalInboundLinks);
        pageValues.put("out-int", Integer.valueOf(internalLinks.size()));
        pageValues.put("out-ext", Integer.valueOf(externalLinks.size()));
        pageValues.put("#words", Integer.valueOf(wordCount));
        // NOTE(review): the size is taken from the length of the page's inner XML, i.e. a character
        // count is treated as a byte count - confirm that this is the intended measure.
        pageValues.put(XHtmlWriter.SIZE, Long.valueOf(SizeUnit.BYTES.toKilobytes(innerXml.length())));
        this.resultTable.put(documentUri, pageValues);
    }

    /**
     * Appends a header line followed by one line per analyzed page to the given file (UTF-8). An
     * {@link IOException} is logged and not propagated.
     *
     * @param path path of the file to append to
     */
    private void writeResultTable(String path) {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path, true), "UTF-8"))) {
            // NOTE(review): the header announces an "indexed" column for which no value is collected;
            // every data row ends with a trailing separator instead.
            writer.write("page;accessible;in-int;out-int;in-ext;out-ext;#words;size KB;indexed\n");
            for (Map.Entry<String, Map<String, Object>> entry : this.resultTable.entrySet()) {
                Map<String, Object> pageValues = entry.getValue();
                StringBuilder line = new StringBuilder();
                line.append(entry.getKey()).append(ClassificationUtils.DEFAULT_SEPARATOR);
                for (String column : RESULT_COLUMNS) {
                    // Missing values are written as "null", exactly like string concatenation would do.
                    line.append(pageValues.get(column)).append(ClassificationUtils.DEFAULT_SEPARATOR);
                }
                line.append(FileHelper.NEWLINE_CHARACTER);
                writer.write(line.toString());
            }
        } catch (IOException e) {
            LOGGER.error("Exception while writing to {}", path, e);
        }
    }

    /**
     * Runs the analysis for a fixed example sitemap and writes the result to {@code sitemapAnalysis.csv}.
     *
     * @param strArr command line arguments (ignored)
     */
    public static void main(String[] strArr) {
        SitemapAnalyzer sitemapAnalyzer = new SitemapAnalyzer();
        sitemapAnalyzer.setNumThreads(10);
        sitemapAnalyzer.analyzeSitemap("http://webknox.com/sitemapIndex.xml", "sitemapAnalysis.csv");
    }
}
