package ws.palladian.extraction.content;

import java.util.List;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.StringInputStream;
import ws.palladian.retrieval.HttpException;
import ws.palladian.retrieval.HttpRetriever;
import ws.palladian.retrieval.HttpRetrieverFactory;
import ws.palladian.retrieval.feeds.FeedItem;
import ws.palladian.retrieval.feeds.parser.FeedParserException;
import ws.palladian.retrieval.feeds.parser.RomeFeedParser;
import ws.palladian.retrieval.parser.ParserException;
import ws.palladian.retrieval.parser.ParserFactory;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/content/FiveFiltersContentExtractor.class */
public class FiveFiltersContentExtractor extends WebPageContentExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(FiveFiltersContentExtractor.class);
    private Node resultNode = null;
    private String extractedTitle = "";
    private String extractedResult = "";
    private final HttpRetriever httpRetriever = HttpRetrieverFactory.getHttpRetriever();

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public WebPageContentExtractor setDocument(String str) throws PageContentExtractorException {
        try {
            this.extractedResult = this.httpRetriever.httpGet(buildRequestUrl(str)).getStringContent();
            try {
                FeedItem feedItem = new RomeFeedParser().getFeed(new StringInputStream(this.extractedResult)).getItems().get(0);
                this.extractedResult = feedItem.getDescription();
                this.extractedTitle = feedItem.getTitle();
                try {
                    this.resultNode = ParserFactory.createHtmlParser().parse(new StringInputStream(this.extractedResult));
                    this.extractedResult = HtmlHelper.documentToReadableText(this.resultNode);
                    this.extractedResult = this.extractedResult.replaceAll("This entry passed through the Full-Text RSS service.*", "");
                } catch (ParserException e) {
                    e.printStackTrace();
                }
            } catch (FeedParserException e2) {
                LOGGER.error(e2.getMessage());
            }
            return this;
        } catch (HttpException e3) {
            throw new PageContentExtractorException("Error when contacting API for URL \"" + str + "\": " + e3.getMessage(), e3);
        }
    }

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public WebPageContentExtractor setDocument(Document document) throws PageContentExtractorException {
        return setDocument(document.getDocumentURI());
    }

    private String buildRequestUrl(String str) {
        return String.format("http://ftr.fivefilters.org/makefulltextfeed.php?url=%s&max=1", UrlHelper.encodeParameter(str));
    }

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public Node getResultNode() {
        return this.resultNode;
    }

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultText() {
        return this.extractedResult;
    }

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getResultTitle() {
        return this.extractedTitle;
    }

    @Override // ws.palladian.extraction.content.WebPageContentExtractor
    public String getExtractorName() {
        return "FiveFilters Content Extractor";
    }

    public static void main(String[] strArr) {
        FiveFiltersContentExtractor fiveFiltersContentExtractor = new FiveFiltersContentExtractor();
        String resultText = fiveFiltersContentExtractor.getResultText("http://travel.cnn.com/Shanghai-joins-Beijing-visa-free-travel-864436");
        System.out.println("title: " + fiveFiltersContentExtractor.getResultTitle());
        System.out.println("text: " + resultText);
    }
}
