package ws.palladian.retrieval;

import com.aliasi.util.Strings;
import edu.stanford.nlp.ling.CoreLabel;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import ws.palladian.helper.Callback;
import ws.palladian.helper.date.DateHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/Crawler.class */
/**
 * A simple multi-threaded crawler: URLs are taken from a pending set ("stack"), each one is
 * fetched in its own thread via the configured {@link DocumentRetriever}, and the links found
 * on the page are fed back into the pending set.
 *
 * <p>All mutable crawl state (pending/visited/seen/dump sets and the worker counter) is guarded
 * by this instance's monitor, because it is touched both by the scheduling thread and by the
 * worker threads.</p>
 */
public class Crawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(Crawler.class);

    /** Upper bound for concurrently running crawl threads. */
    private int maxThreads = 10;

    /** Number of crawl threads currently running; guarded by {@code this}. */
    private int threadCount = 0;

    /** Passed through to {@code HtmlHelper.getLinks} as its second argument. */
    private boolean inDomain = true;

    /** Passed through to {@code HtmlHelper.getLinks} as its third argument. */
    private boolean outDomain = true;

    /** If non-empty, only URLs containing at least one of these substrings are queued. */
    private final Set<String> onlyFollow = new HashSet<String>();

    /** Maximum number of visited URLs before scheduling stops; -1 means no limit. */
    private int stopCount = -1;

    /** URLs waiting to be crawled; guarded by {@code this}. */
    private Set<String> urlStack = null;

    /** URLs that have already been taken from the stack; guarded by {@code this}. */
    private final Set<String> visitedURLs = new HashSet<String>();

    /** Every URL that has been offered to {@link #addURLToStack}; guarded by {@code this}. */
    private final Set<String> seenURLs = new HashSet<String>();

    /** Substrings used by {@link #checkURLRules} to decide whether an unfollowed URL is dumped. */
    private final Set<String> urlRules = new HashSet<String>();

    /** Collected "url + source" entries for URLs that were not followed; guarded by {@code this}. */
    private final Set<String> urlDump = new HashSet<String>();

    /** Optional callback that is invoked once when the crawl has finished. */
    private Callback crawlerCallbackOnFinish = null;

    private DocumentRetriever documentRetriever = new DocumentRetriever();

    /**
     * Fetches the given URL, extracts its links and offers them to the pending set.
     *
     * @param str the URL to crawl
     */
    protected void crawl(String str) {
        LOGGER.info("catch from stack: {}", str);
        Set<String> links = HtmlHelper.getLinks(this.documentRetriever.getWebDocument(str), this.inDomain, this.outDomain);

        // Read the sizes under the lock: the sets are modified concurrently by other workers.
        int stackSize;
        int dumpSize;
        int visitedSize;
        synchronized (this) {
            stackSize = this.urlStack.size();
            dumpSize = this.urlDump.size();
            visitedSize = this.visitedURLs.size();
        }
        LOGGER.info("\n\nretrieved {} links from {} || stack size: {} dump size: {}, visited: {}", links.size(), str, stackSize, dumpSize, visitedSize);
        addURLsToStack(links, str);
    }

    /**
     * Writes the collected URL dump to the given file.
     *
     * @param str path of the file to write to
     */
    public final void saveUrlDump(String str) {
        // Hold the lock while writing so that worker threads cannot modify the set mid-iteration.
        synchronized (this) {
            FileHelper.writeToFile(str, this.urlDump);
        }
    }

    /**
     * Scheduling loop: starts one thread per pending URL (bounded by the thread limit) until the
     * pending set is empty or the stop count is reached, then waits up to 180 seconds for the
     * remaining workers and finally invokes the finish callback, if one is set.
     *
     * <p>If the scheduling thread is interrupted while waiting, the interrupt flag is restored
     * and the method returns without invoking the callback.</p>
     */
    private void startCrawl() {
        while (hasPendingUrls() && !stopCountReached()) {
            int allowedThreads = getMaxThreads();
            if (pendingUrlCount() <= this.maxThreads) {
                allowedThreads = 1;
            }
            // A limit below one would make the wait loop below spin forever.
            allowedThreads = Math.max(1, allowedThreads);

            while (getThreadCount() >= allowedThreads) {
                if (!pause(2000L)) {
                    return;
                }
            }

            final String url = getURLFromStack();
            Thread worker = new Thread("CrawlThread" + System.currentTimeMillis()) {
                @Override
                public void run() {
                    try {
                        Crawler.this.crawl(url);
                    } finally {
                        // Always release the slot, even if crawling failed with an exception.
                        Crawler.this.decreaseThreadCount();
                    }
                }
            };
            // Count the worker before it runs so the counter never under-reports.
            increaseThreadCount();
            worker.start();

            // Nothing pending but workers still running: give them up to 60s to deliver links.
            int idleSeconds = 0;
            while (!hasPendingUrls() && getThreadCount() > 0 && idleSeconds < 60) {
                idleSeconds++;
                if (!pause(1000L)) {
                    return;
                }
            }
        }

        int drainSeconds = 0;
        while (getThreadCount() > 0 && drainSeconds < 180) {
            LOGGER.info("wait a second ({} more times)", 180 - drainSeconds);
            drainSeconds++;
            if (!pause(1000L)) {
                return;
            }
        }

        if (this.crawlerCallbackOnFinish != null) {
            this.crawlerCallbackOnFinish.callback();
        }
    }

    /**
     * Sleeps for the given time.
     *
     * @param millis time to sleep in milliseconds
     * @return {@code true} if the sleep completed, {@code false} if the thread was interrupted
     *         (in which case the interrupt flag has been restored)
     */
    private boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            LOGGER.warn(e.getMessage());
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** @return {@code true} if at least one URL is waiting to be crawled */
    private synchronized boolean hasPendingUrls() {
        return !this.urlStack.isEmpty();
    }

    /** @return the number of URLs waiting to be crawled */
    private synchronized int pendingUrlCount() {
        return this.urlStack.size();
    }

    /** @return {@code true} if a stop count is set and that many URLs have been visited */
    private synchronized boolean stopCountReached() {
        return this.stopCount != -1 && this.visitedURLs.size() >= this.stopCount;
    }

    /**
     * Starts crawling from the given set of URLs. The set is used directly as the pending set
     * and is therefore modified during the crawl. Blocks until the crawl has finished.
     *
     * @param set the seed URLs
     * @param z   value for the in-domain flag handed to the link extraction
     * @param z2  value for the out-domain flag handed to the link extraction
     */
    public void startCrawl(Set<String> set, boolean z, boolean z2) {
        synchronized (this) {
            this.urlStack = set;
        }
        this.inDomain = z;
        this.outDomain = z2;
        startCrawl();
    }

    /**
     * Starts crawling from a single seed URL. Blocks until the crawl has finished.
     *
     * @param str the seed URL
     * @param z   value for the in-domain flag handed to the link extraction
     * @param z2  value for the out-domain flag handed to the link extraction
     */
    public void startCrawl(String str, boolean z, boolean z2) {
        Set<String> seeds = new HashSet<String>();
        seeds.add(str);
        synchronized (this) {
            this.urlStack = seeds;
        }
        this.inDomain = z;
        this.outDomain = z2;
        startCrawl();
    }

    /**
     * Takes an arbitrary URL from the pending set and marks it as visited.
     *
     * @return the URL that was removed from the pending set
     */
    private synchronized String getURLFromStack() {
        String next = this.urlStack.iterator().next();
        removeURLFromStack(next);
        return next;
    }

    /**
     * Removes the URL from the pending set and records it as visited.
     *
     * @param str the URL to move
     */
    private synchronized void removeURLFromStack(String str) {
        this.urlStack.remove(str);
        this.visitedURLs.add(str);
    }

    /**
     * Sets the number of visited URLs after which no further URLs are scheduled.
     *
     * @param i the limit, or -1 for no limit
     */
    public void setStopCount(int i) {
        this.stopCount = i;
    }

    /**
     * Adds a substring filter; once any filter exists, only URLs containing at least one of the
     * filters are queued for crawling.
     *
     * @param str the substring a URL must contain to be followed
     */
    public synchronized void addOnlyFollow(String str) {
        this.onlyFollow.add(str);
    }

    /**
     * Adds a rule used to decide whether a URL that is not followed is recorded in the dump.
     *
     * @param str the substring to look for in the (separator-normalized) source URL
     */
    public synchronized void addURLRule(String str) {
        this.urlRules.add(str);
    }

    /**
     * Offers every URL of the given set to the pending set.
     *
     * @param set the URLs found on a page
     * @param str the URL of the page the links were found on
     */
    private synchronized void addURLsToStack(Set<String> set, String str) {
        for (String url : set) {
            addURLToStack(url, str);
        }
    }

    /**
     * Offers a single URL to the pending set. URLs that are {@code null}, 400 characters or
     * longer, or already visited are ignored. If only-follow filters exist and none matches, the
     * URL is not queued; instead, the first time it is seen, it is added to the dump provided the
     * source URL passes {@link #checkURLRules}.
     *
     * @param str  the URL to offer
     * @param str2 the URL of the page the link was found on
     */
    private synchronized void addURLToStack(String str, String str2) {
        if (str == null || str.length() >= 400 || this.visitedURLs.contains(str)) {
            return;
        }

        boolean follow = true;
        if (!this.onlyFollow.isEmpty()) {
            follow = false;
            for (String required : this.onlyFollow) {
                if (str.contains(required)) {
                    follow = true;
                    break;
                }
            }
        }

        if (follow) {
            this.urlStack.add(str);
        } else if (!this.seenURLs.contains(str)) {
            String source = str2.replace(CoreLabel.TAG_SEPARATOR, Strings.SINGLE_SPACE_STRING).trim();
            if (checkURLRules(source)) {
                this.urlDump.add(str + Strings.SINGLE_SPACE_STRING + source);
            }
        }
        this.seenURLs.add(str);
    }

    /**
     * Checks whether the given string contains at least one of the configured URL rules after
     * its separators have been normalized.
     *
     * @param str the string to test
     * @return {@code true} if any rule occurs in the normalized string (including at its very
     *         beginning), {@code false} otherwise or if no rules are configured
     */
    private boolean checkURLRules(String str) {
        // The normalization does not depend on the rule, so do it once instead of per iteration.
        String normalized = str.replace(CoreLabel.TAG_SEPARATOR, Strings.SINGLE_SPACE_STRING);
        for (String rule : this.urlRules) {
            // ">= 0" so that a match at position 0 counts, consistent with the only-follow check.
            if (normalized.indexOf(rule) >= 0) {
                return true;
            }
        }
        return false;
    }

    /** @return the maximum number of concurrently running crawl threads */
    public int getMaxThreads() {
        return this.maxThreads;
    }

    /**
     * @param i the maximum number of concurrently running crawl threads
     */
    public void setMaxThreads(int i) {
        this.maxThreads = i;
    }

    /** @return the number of crawl threads currently counted as running */
    public synchronized int getThreadCount() {
        return this.threadCount;
    }

    /** Increments the running-thread counter. */
    public synchronized void increaseThreadCount() {
        this.threadCount++;
    }

    /** Decrements the running-thread counter. */
    public synchronized void decreaseThreadCount() {
        this.threadCount--;
    }

    /** @return the callback invoked when the crawl finishes, or {@code null} if none is set */
    public Callback getCrawlerCallbackOnFinish() {
        return this.crawlerCallbackOnFinish;
    }

    /**
     * @param callback the callback to invoke when the crawl finishes; may be {@code null}
     */
    public void setCrawlerCallbackOnFinish(Callback callback) {
        this.crawlerCallbackOnFinish = callback;
    }

    /**
     * Registers a retriever callback on the underlying {@link DocumentRetriever}.
     *
     * @param retrieverCallback the callback to register
     */
    public void addCrawlerCallback(RetrieverCallback<Document> retrieverCallback) {
        this.documentRetriever.addRetrieverCallback(retrieverCallback);
    }

    /** @return the retriever used to download documents */
    public DocumentRetriever getDocumentRetriever() {
        return this.documentRetriever;
    }

    /**
     * @param documentRetriever the retriever to use for downloading documents
     */
    public void setDocumentRetriever(DocumentRetriever documentRetriever) {
        this.documentRetriever = documentRetriever;
    }

    /**
     * Example usage: crawls up to 1000 pages starting at dmoz.org with a single thread and logs
     * every downloaded page.
     *
     * @param strArr unused
     */
    public static void main(String[] strArr) {
        Crawler crawler = new Crawler();
        crawler.addCrawlerCallback(new RetrieverCallback<Document>() {
            @Override
            public void onFinishRetrieval(Document document) {
                Crawler.LOGGER.info("downloaded the page {}", document.getDocumentURI());
            }
        });
        crawler.setStopCount(1000);
        crawler.setMaxThreads(1);
        crawler.startCrawl("http://www.dmoz.org/", true, true);
    }
}
