package ws.palladian.extraction.feature;

import org.apache.commons.lang3.StringEscapeUtils;
import ws.palladian.helper.html.HtmlHelper;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.processing.TextDocument;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/feature/HtmlCleaner.class */
/**
 * Pipeline processor that rewrites a document's content as text with HTML markup removed.
 *
 * <p>The content is run through {@code HtmlHelper.stripHtmlTags}, then any {@code <br>} /
 * {@code <br/>} markers still present are replaced with {@code FileHelper.NEWLINE_CHARACTER},
 * and finally HTML 4 entities are unescaped via {@code StringEscapeUtils.unescapeHtml4}.
 */
public final class HtmlCleaner extends TextDocumentPipelineProcessor {

    /**
     * Replaces the content of the given document with its cleaned form.
     *
     * @param textDocument the document whose content is read, cleaned and written back
     */
    @Override
    public final void processDocument(TextDocument textDocument) {
        // Step 1: remove markup from the current content.
        String strippedContent = HtmlHelper.stripHtmlTags(textDocument.getContent());

        // Step 2: turn line-break tags into newline characters.
        // NOTE(review): this runs after tag stripping, so it only has an effect if
        // stripHtmlTags leaves <br> tags in place -- confirm against HtmlHelper.
        String contentWithNewlines = strippedContent.replaceAll("<br\\s*/?>", FileHelper.NEWLINE_CHARACTER);

        // Step 3: decode HTML entities (e.g. &amp;) into their literal characters.
        String cleanedContent = StringEscapeUtils.unescapeHtml4(contentWithNewlines);

        textDocument.setContent(cleanedContent);
    }
}
