package ws.palladian.extraction.location.evaluation;

import edu.stanford.nlp.ling.CoreLabel;
import java.io.File;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import ws.palladian.extraction.entity.ContextAnnotation;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.extraction.entity.TaggingFormat;
import ws.palladian.extraction.location.GeoCoordinate;
import ws.palladian.extraction.location.LocationExtractorUtils;
import ws.palladian.extraction.location.LocationType;
import ws.palladian.extraction.token.Tokenizer;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.collection.CountMap;
import ws.palladian.helper.collection.Factory;
import ws.palladian.helper.collection.LazyMap;
import ws.palladian.helper.io.FileHelper;
import ws.palladian.helper.math.MathHelper;
import ws.palladian.helper.nlp.StringHelper;

/* loaded from: input_file:lib/palladian.jar:ws/palladian/extraction/location/evaluation/DatasetCheck.class */
/**
 * Command line sanity checker for a directory of XML-annotated location texts.
 *
 * <p>
 * {@link #performCheck(File)} scans every file returned by {@code FileHelper.getFiles(path, "text")}, prints
 * {@code [error]}/{@code [warn]} lines for suspicious annotations to standard output and finishes with per-tag
 * statistics. {@link #getNonDisambiguatedStatistics(File)} reports how many annotations have an entry in the
 * accompanying {@code coordinates.csv}.
 */
final class DatasetCheck {
    /**
     * Matches one annotation: group 1 is the content of the opening tag, group 2 the annotated text, group 3 the
     * optional slash of the closing tag and group 4 the content of the closing tag.
     */
    private static final Pattern TAG_REGEX = Pattern.compile("<([^>]*)>([^<]*)<(/?)([^>]*)>");

    /** Attribute which is stripped from the opening tag before the tag name is compared. */
    private static final String MAIN_ROLE_ATTRIBUTE = "role=\"main\"";

    /** Annotations with more characters than this are reported as suspiciously long. */
    private static final int MAX_ANNOTATION_LENGTH = 50;

    /** Number of characters shown before and after a potentially missed annotation. */
    private static final int CONTEXT_LENGTH = 15;

    /** Dataset directory used by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_DATASET_PATH = "/Users/pk/Dropbox/Uni/Datasets/TUD-Loc-2013/TUD-Loc-2013_V2";

    /** Tag names considered valid; filled from {@link LocationType#values()} in the static initializer. */
    private static final Set<String> allowedTags = CollectionHelper.newHashSet();

    DatasetCheck() {
    }

    /**
     * Checks all text files in the given directory and prints findings and statistics to standard output.
     *
     * @param file The dataset directory.
     * @throws IllegalStateException If the path is no directory, or if it contains no text files.
     */
    static void performCheck(File file) {
        if (!file.isDirectory()) {
            throw new IllegalStateException("Specified path '" + file + "' does not exist or is no directory.");
        }
        File[] files = FileHelper.getFiles(file.getPath(), "text");
        if (files.length == 0) {
            throw new IllegalStateException("No text files found in '" + file + "'");
        }
        // tag name -> counts of the annotated texts carrying that tag, accumulated over all files
        LazyMap<String, CountMap<String>> assignedTagCounts = LazyMap.create(new Factory<CountMap<String>>() {
            @Override
            public CountMap<String> create() {
                return CountMap.create();
            }
        });
        int tokenCount = 0;
        for (File textFile : files) {
            tokenCount += checkFile(textFile, assignedTagCounts);
        }
        printSummary(assignedTagCounts, tokenCount, files.length);
    }

    /**
     * Checks a single file, records its annotations in {@code assignedTagCounts} and returns its token count.
     */
    private static int checkFile(File textFile, LazyMap<String, CountMap<String>> assignedTagCounts) {
        String filePath = textFile.getAbsolutePath();
        String fileName = textFile.getName();
        String content = FileHelper.readFileToString(filePath);
        // annotated text -> all tag names which were assigned to it within this file
        LazyMap<String, Set<String>> tagsByText = LazyMap.create(new Factory<Set<String>>() {
            @Override
            public Set<String> create() {
                return CollectionHelper.newHashSet();
            }
        });
        int tokenCount = Tokenizer.tokenize(FileFormatParser.getText(filePath, TaggingFormat.XML)).size();

        Matcher matcher = TAG_REGEX.matcher(content);
        while (matcher.find()) {
            String openingTag = stripMainRole(matcher.group(1));
            String annotatedText = matcher.group(2);
            checkTags(openingTag, matcher.group(3), matcher.group(4), fileName);
            if (annotatedText.length() == 0) {
                // The regex permits an empty body; the character checks below would fail on it and an empty
                // text must not be searched for as a "missed annotation", so report and skip it.
                System.out.println("[error] empty annotation " + openingTag + " in " + fileName);
                continue;
            }
            checkAnnotatedText(annotatedText, fileName);
            tagsByText.get(annotatedText).add(openingTag);
            assignedTagCounts.get(openingTag).add(annotatedText);
        }

        for (String annotatedText : tagsByText.keySet()) {
            Set<String> tags = tagsByText.get(annotatedText);
            if (tags.size() > 1) {
                System.out.println("[warn] ambiguous annotations for " + annotatedText + ": " + tags + " in " + fileName);
            }
        }
        for (String annotatedText : tagsByText.keySet()) {
            for (String tag : tagsByText.get(annotatedText)) {
                reportPotentiallyMissed(content, annotatedText, tag, fileName);
            }
        }
        if (tagsByText.isEmpty()) {
            System.out.println("[warn] no annotations in " + fileName);
        }
        return tokenCount;
    }

    /** Cuts off the {@code role="main"} attribute and everything after it, so only the tag name remains. */
    private static String stripMainRole(String openingTag) {
        int attributeIndex = openingTag.indexOf(MAIN_ROLE_ATTRIBUTE);
        if (attributeIndex < 0) {
            return openingTag;
        }
        return openingTag.substring(0, attributeIndex).trim();
    }

    /** Reports a missing closing slash, mismatching opening/closing tags and tags which are not allowed. */
    private static void checkTags(String openingTag, String slash, String closingTag, String fileName) {
        if (!"/".equals(slash)) {
            System.out.println("[error] " + closingTag + " does not start with '/' in " + fileName);
        }
        if (!openingTag.equals(closingTag)) {
            System.out.println("[error] " + openingTag + " does not match " + closingTag + " in " + fileName);
        }
        if (!allowedTags.contains(openingTag)) {
            System.out.println("[error] unknown tag " + openingTag + " in " + fileName);
        }
    }

    /** Reports overly long annotations and annotations with punctuation or white space at their borders. */
    private static void checkAnnotatedText(String annotatedText, String fileName) {
        char first = annotatedText.charAt(0);
        char last = annotatedText.charAt(annotatedText.length() - 1);
        if (annotatedText.length() > MAX_ANNOTATION_LENGTH) {
            System.out.println("[warn] " + annotatedText + " seems rather long for an annotation in " + fileName);
        }
        if (StringHelper.isPunctuation(first)) {
            System.out.println("[warn] '" + annotatedText + "' starts with punctuation in " + fileName);
        }
        if (StringHelper.isPunctuation(last)) {
            System.out.println("[warn] '" + annotatedText + "' ends with punctuation in " + fileName);
        }
        if (Character.isWhitespace(first)) {
            System.out.println("[warn] '" + annotatedText + "' starts with white space in " + fileName);
        }
        if (Character.isWhitespace(last)) {
            System.out.println("[warn] '" + annotatedText + "' ends with white space in " + fileName);
        }
    }

    /**
     * Searches the raw file content for occurrences of an annotated text which are not enclosed by the given tag
     * and prints each hit together with some surrounding context.
     */
    private static void reportPotentiallyMissed(String content, String annotatedText, String tag, String fileName) {
        // Both the text and the tag come from the file, so quote them; an unquoted metacharacter could make the
        // pattern invalid (in particular inside the look-behind).
        String quotedTag = Pattern.quote(tag);
        String regex = String.format("(?<!<%s>)(?<=[\\s\"])%s(?!</%s>)(?=[\\s.,:;?!])", quotedTag,
                Pattern.quote(annotatedText), quotedTag);
        Matcher matcher = Pattern.compile(regex).matcher(content);
        while (matcher.find()) {
            int contextStart = Math.max(0, matcher.start() - CONTEXT_LENGTH);
            int contextEnd = Math.min(content.length(), matcher.end() + CONTEXT_LENGTH);
            String context = content.substring(contextStart, contextEnd).replace('\n', ' ');
            System.out.println("[warn] potentially missed annotation for '" + annotatedText + "' (context '" + context + "' in " + fileName);
        }
    }

    /** Prints total and unique annotation counts per tag, followed by the overall totals. */
    private static void printSummary(LazyMap<String, CountMap<String>> assignedTagCounts, int tokenCount, int textCount) {
        System.out.println('\n');
        System.out.println("Assigned tags:");
        int totalAnnotations = 0;
        int uniqueAnnotations = 0;
        for (String tag : assignedTagCounts.keySet()) {
            CountMap<String> counts = assignedTagCounts.get(tag);
            int total = counts.totalSize();
            int unique = counts.uniqueSize();
            System.out.println(tag + " total: " + total + ", unique: " + unique);
            totalAnnotations += total;
            uniqueAnnotations += unique;
        }
        System.out.println();
        System.out.println("# total: " + totalAnnotations);
        System.out.println("# unique: " + uniqueAnnotations);
        System.out.println("# tokens: " + tokenCount);
        System.out.println();
        System.out.println("# texts: " + textCount);
    }

    /**
     * Prints, per tag and in total, how many annotations have a coordinate entry in {@code coordinates.csv}
     * (looked up by file name and annotation start position).
     *
     * @param file The dataset directory containing the text files and {@code coordinates.csv}.
     */
    static void getNonDisambiguatedStatistics(File file) {
        Map<String, Map<Integer, GeoCoordinate>> coordinates = LocationExtractorUtils.readCoordinates(new File(file, "coordinates.csv"));
        CountMap<String> totalCounts = CountMap.create();
        CountMap<String> disambiguatedCounts = CountMap.create();
        for (File textFile : FileHelper.getFiles(file.getPath(), "text")) {
            String text = FileHelper.readFileToString(textFile).replace(" role=\"main\"", "");
            // A file without any coordinate entry yields null here; treat all its annotations as not disambiguated.
            Map<Integer, GeoCoordinate> fileCoordinates = coordinates.get(textFile.getName());
            for (ContextAnnotation annotation : FileFormatParser.getAnnotationsFromXmlText(text)) {
                totalCounts.add(annotation.getTag());
                if (fileCoordinates != null && fileCoordinates.get(Integer.valueOf(annotation.getStartPosition())) != null) {
                    disambiguatedCounts.add(annotation.getTag());
                }
            }
        }
        for (String tag : disambiguatedCounts.keySet()) {
            int total = totalCounts.getCount(tag);
            int disambiguated = disambiguatedCounts.getCount(tag);
            System.out.println(tag + " total: " + total + ", disambiguated: " + disambiguated + ", percentage: " + MathHelper.round(percentage(disambiguated, total), 2));
        }
        System.out.println();
        System.out.println("# total disambiguated: " + disambiguatedCounts.totalSize());
        System.out.println("% total disambiguated: " + MathHelper.round(percentage(disambiguatedCounts.totalSize(), totalCounts.totalSize()), 2));
    }

    /** Returns {@code part / whole} in percent using floating point division; zero when {@code whole} is zero. */
    private static float percentage(int part, int whole) {
        if (whole == 0) {
            return 0.0f;
        }
        return ((float) part / whole) * 100.0f;
    }

    /**
     * Runs {@link #performCheck(File)}.
     *
     * @param strArr Optional: the dataset directory as first argument; a built-in default path is used otherwise.
     */
    public static void main(String[] strArr) {
        String datasetPath = strArr != null && strArr.length > 0 ? strArr[0] : DEFAULT_DATASET_PATH;
        performCheck(new File(datasetPath));
    }

    static {
        for (LocationType locationType : LocationType.values()) {
            allowedTags.add(locationType.toString());
        }
    }
}
