package ws.palladian.retrieval.feeds.discovery;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ws.palladian.helper.UrlHelper;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.Factory;
import ws.palladian.helper.collection.LazyMap;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.io.LineAction;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/retrieval/feeds/discovery/FeedUrlsNearDuplicateEliminator.class */
/**
 * Eliminates near-duplicate feed URLs, i.e. URLs which only differ in the feed format they
 * name (for example {@code .../feed?type=rss2} and {@code .../feed?type=atom}).
 *
 * <p>
 * Of each such group only one URL is kept; the format is chosen by its position in
 * {@link #FORMATS} (Atom variants first, then RSS variants). In addition, URLs which are fully
 * contained in another URL of the same input and URLs which are equal when ignoring case are
 * removed. URLs matching {@link #IGNORE_PATTERN} (session ids) are dropped altogether.
 * </p>
 */
public class FeedUrlsNearDuplicateEliminator {
    private static final Logger LOGGER = LoggerFactory.getLogger(FeedUrlsNearDuplicateEliminator.class);

    /** Negative look-behind: the format name must not be preceded by a word character. */
    private static final String START_PATTERN = "(?<!\\w)";

    /** Negative look-ahead: the format name must not be followed by a word character. */
    private static final String STOP_PATTERN = "(?!\\w)";

    /** Marker which temporarily replaces the format name inside a URL. */
    private static final String FORMAT_PLACEHOLDER = "###FORMAT###";

    private static final String[] ATOM = {"atom10", "atom1.0", "atom_1.0", "atom_10", "atom"};
    private static final String[] RSS = {"rss_2.0", "rss_200", "rss_20", "rss2.0", "rss200", "rss20", "rss2", "rss"};

    /** All known format names in order of preference; earlier entries win during de-duplication. */
    private static final String[] FORMATS = (String[]) ArrayUtils.addAll(ATOM, RSS);

    /** URLs containing a session id are ignored completely. */
    private static final Pattern IGNORE_PATTERN = Pattern.compile("sessionid|PHPSESSID", Pattern.CASE_INSENSITIVE);

    /** Matches any of the {@link #FORMATS}; must be declared after LOGGER and FORMATS, which it uses. */
    private static final Pattern FORMAT_PATTERN = compilePattern();

    private static final String INPUT_FILE = "F:\\Konferenzen und Meetings\\papers_(eigene)\\2011_feedDatasetPaper\\gathering_TUDCS6\\foundFeedsDeduplicatedSortedReachable.txt";
    private static final String OUTPUT_FILE = "F:\\Konferenzen und Meetings\\papers_(eigene)\\2011_feedDatasetPaper\\gathering_TUDCS6\\foundFeedsDeduplicatedSortedRemovedUnreachableAndNearDuplicates.txt";

    /**
     * Builds the case-insensitive pattern matching any entry of {@link #FORMATS} which is not
     * surrounded by word characters.
     *
     * @return The compiled pattern.
     */
    private static Pattern compilePattern() {
        StringBuilder regex = new StringBuilder();
        regex.append(START_PATTERN).append('(');
        for (int i = 0; i < FORMATS.length; i++) {
            if (i > 0) {
                regex.append('|');
            }
            // quote the name, so that the '.' in e.g. "atom1.0" is a literal dot and not a wildcard
            regex.append(Pattern.quote(FORMATS[i]));
        }
        regex.append(')').append(STOP_PATTERN);
        LOGGER.debug(regex.toString());
        return Pattern.compile(regex.toString(), Pattern.CASE_INSENSITIVE);
    }

    /**
     * De-duplicates the supplied feed URLs.
     *
     * <ol>
     * <li>URLs matching {@link #IGNORE_PATTERN} and <code>null</code> entries are skipped.</li>
     * <li>URLs naming exactly one feed format are grouped by the URL with the format replaced by
     * a placeholder; per group only the URL with the most preferred format is kept.</li>
     * <li>URLs naming no format, or more than one, are kept as they are (trimmed).</li>
     * <li>URLs which are a proper substring of another remaining URL are removed.</li>
     * <li>URLs which are equal to an earlier URL when ignoring case are removed.</li>
     * </ol>
     *
     * @param collection The feed URLs to de-duplicate, not <code>null</code>.
     * @return A new, modifiable list with the remaining URLs.
     */
    public static List<String> deDuplicate(Collection<String> collection) {
        List<String> result = new ArrayList<String>();

        // URL with placeholder -> all format names which were found at the placeholder's position
        Map<String, Collection<String>> formatsByUrl = LazyMap.create(new Factory<Collection<String>>() {
            @Override
            public Collection<String> create() {
                return CollectionHelper.newArrayList();
            }
        });

        for (String link : collection) {
            if (link == null || IGNORE_PATTERN.matcher(link).find()) {
                continue;
            }
            String url = link.trim();
            LOGGER.debug("link : {}", url);

            Matcher matcher = FORMAT_PATTERN.matcher(url);
            String format = null;
            int matches = 0;
            while (matcher.find()) {
                matches++;
                format = matcher.group();
            }

            if (matches == 1) {
                LOGGER.debug("   format : {}", format);
                // replace through the matcher, so that the case-insensitive flag stays in effect;
                // String#replaceAll would recompile the expression without it
                String urlWithPlaceholder = matcher.replaceAll(FORMAT_PLACEHOLDER);
                formatsByUrl.get(urlWithPlaceholder).add(format);
            } else {
                if (matches > 1) {
                    LOGGER.error("Found too many feed formats in : {} - can't deduplicate.", url);
                }
                result.add(url);
            }
        }

        // per group, re-insert the most preferred format in its original spelling
        for (Map.Entry<String, Collection<String>> entry : formatsByUrl.entrySet()) {
            String preferredFormat = pickPreferredFormat(entry.getValue());
            if (preferredFormat != null) {
                result.add(entry.getKey().replace(FORMAT_PLACEHOLDER, preferredFormat));
            }
        }

        // remove URLs which are contained in another URL
        Iterator<String> iterator = result.iterator();
        while (iterator.hasNext()) {
            if (isContainedInAnother(iterator.next(), result)) {
                iterator.remove();
            }
        }

        // remove URLs which only differ in case from an earlier one
        Collection<String> seen = new HashSet<String>();
        iterator = result.iterator();
        while (iterator.hasNext()) {
            if (!seen.add(iterator.next().toLowerCase(Locale.ROOT))) {
                iterator.remove();
            }
        }
        return result;
    }

    /**
     * Selects, from the format names found in a group of URLs, the one ranking first in
     * {@link #FORMATS}. The comparison ignores case, because the names were matched ignoring case.
     *
     * @param foundFormats The format names as they were spelled in the URLs.
     * @return The preferred name in its original spelling, or <code>null</code> if none of the
     *         names is a known format.
     */
    private static String pickPreferredFormat(Collection<String> foundFormats) {
        for (String knownFormat : FORMATS) {
            for (String foundFormat : foundFormats) {
                if (knownFormat.equalsIgnoreCase(foundFormat)) {
                    return foundFormat;
                }
            }
        }
        return null;
    }

    /**
     * Checks whether a URL is a proper substring of any other URL in the list.
     *
     * @param candidate The URL to check.
     * @param urls The URLs to compare with; entries equal to the candidate are ignored.
     * @return <code>true</code> if a different URL contains the candidate.
     */
    private static boolean isContainedInAnother(String candidate, List<String> urls) {
        for (String other : urls) {
            if (!other.equals(candidate) && other.indexOf(candidate) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends the supplied lines to a file, each one terminated by
     * {@link FileHelper#NEWLINE_CHARACTER}.
     *
     * @param str The path of the file to append to.
     * @param collection The lines to append.
     */
    public static void appendFile(String str, Collection<String> collection) {
        StringBuilder content = new StringBuilder();
        for (String line : collection) {
            content.append(line).append(FileHelper.NEWLINE_CHARACTER);
        }
        FileHelper.appendFile(str, content.toString());
    }

    /**
     * Reads the feed URLs from the input file, de-duplicates them domain by domain and writes the
     * result to the output file. URLs of one domain must be on consecutive lines, because the
     * collected URLs are flushed each time the domain changes.
     *
     * @param strArr Command line arguments, not used.
     */
    public static void main(String[] strArr) {
        final List<String> domainUrls = new ArrayList<String>();
        LineAction lineAction = new LineAction() {
            /** Domain of the previously processed line, <code>null</code> before the first line. */
            private String previousDomain = null;

            @Override
            public void performAction(String str, int i) {
                if (i % 10000 == 0) {
                    LOGGER.info("{} lines processed.", i);
                }
                String domain = UrlHelper.getDomain(str);
                // domain changed: everything collected so far belongs to the previous domain
                if (previousDomain != null && !previousDomain.equalsIgnoreCase(domain)) {
                    appendFile(OUTPUT_FILE, deDuplicate(domainUrls));
                    domainUrls.clear();
                }
                domainUrls.add(str);
                previousDomain = domain;
            }
        };
        FileHelper.delete(OUTPUT_FILE);
        FileHelper.performActionOnEveryLine(INPUT_FILE, lineAction);
        // flush the URLs of the last domain
        appendFile(OUTPUT_FILE, deDuplicate(domainUrls));
    }
}
