package de.unihd.dbs.uima.annotator.treetagger;

import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.internal.util.Misc;

/* loaded from: input_file:BOOT-INF/lib/heideltime-2.2.1.jar:de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer.class */
public class TreeTaggerTokenizer {
    EnumSet<Flag> flags;
    private File abbreviationsFile;
    private String FClitic;
    private String PClitic;
    private String PChar = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'";
    private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›";
    private ArrayList<String> abbreviations = new ArrayList<>();

    /* loaded from: input_file:BOOT-INF/lib/heideltime-2.2.1.jar:de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer$Flag.class */
    public enum Flag {
        ENGLISH,
        FRENCH,
        ITALIAN,
        GALICIAN,
        Z;

        public static EnumSet<Flag> getSet(String str) {
            EnumSet<Flag> noneOf = EnumSet.noneOf(Flag.class);
            if (str == null) {
                return noneOf;
            }
            if (str.contains("-e")) {
                noneOf.add(ENGLISH);
            }
            if (str.contains("-f")) {
                noneOf.add(FRENCH);
            }
            if (str.contains("-i")) {
                noneOf.add(ITALIAN);
            }
            if (str.contains("-g")) {
                noneOf.add(GALICIAN);
            }
            if (str.contains("-z")) {
                noneOf.add(Z);
            }
            return noneOf;
        }
    }

    public TreeTaggerTokenizer(String str, EnumSet<Flag> enumSet) throws RuntimeException {
        this.flags = null;
        this.abbreviationsFile = null;
        this.FClitic = "";
        this.PClitic = "";
        this.flags = enumSet;
        if (str != null) {
            this.abbreviationsFile = new File(str);
            if (!this.abbreviationsFile.exists() || !this.abbreviationsFile.canRead()) {
                Logger.printError(getClass(), "Couldn't read abbreviations file " + str + " (exist:" + this.abbreviationsFile.exists() + ",read:" + this.abbreviationsFile.canRead() + ")");
                throw new RuntimeException();
            }
            BufferedReader bufferedReader = null;
            try {
                try {
                    bufferedReader = new BufferedReader(new FileReader(this.abbreviationsFile));
                    while (true) {
                        String readLine = bufferedReader.readLine();
                        if (readLine == null) {
                            break;
                        }
                        String replaceAll = readLine.replaceAll("^[ \t\r\n]+", "").replaceAll("[ \t\r\n]+$", "");
                        if (!replaceAll.matches("^(#.*|\\s$)")) {
                            this.abbreviations.add(replaceAll);
                        }
                    }
                    if (bufferedReader != null) {
                        try {
                            bufferedReader.close();
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                    }
                } catch (Exception e2) {
                    e2.printStackTrace();
                    if (bufferedReader != null) {
                        try {
                            bufferedReader.close();
                        } catch (Exception e3) {
                            e3.printStackTrace();
                        }
                    }
                }
            } catch (Throwable th) {
                if (bufferedReader != null) {
                    try {
                        bufferedReader.close();
                    } catch (Exception e4) {
                        e4.printStackTrace();
                    }
                }
                throw th;
            }
        }
        if (enumSet.contains(Flag.ENGLISH)) {
            this.FClitic = "'(s|re|ve|d|m|em|ll)|n't";
        }
        if (enumSet.contains(Flag.ITALIAN)) {
            this.PClitic = "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'";
        }
        if (enumSet.contains(Flag.FRENCH)) {
            this.PClitic = "[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";
            this.FClitic = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
        }
        if (enumSet.contains(Flag.GALICIAN)) {
            this.FClitic = "-la|-las|-lo|-los|-nos";
        }
    }

    public List<String> tokenize(String str) {
        Boolean bool;
        StringBuilder sb = new StringBuilder();
        for (String str2 : str.split("\n")) {
            for (String str3 : str2.replaceAll("[\r\n\t]", " ").replaceAll("(<[^<> ]*) ([^<>]*>)", "$1ÿ$2").replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "þ").replaceAll("ÿ", " ").replaceAll("þ", "ÿ").replaceAll("(<[^<>]*>)", "ÿ$1ÿ").replaceAll("^ÿ", "").replaceAll("ÿ$", "").replaceAll("ÿÿÿ*", "ÿ").split("ÿ")) {
                if (str3.matches("^<.*>$")) {
                    sb.append(str3 + "\n");
                } else {
                    String replaceAll = (" " + str3 + " ").replaceAll("\\.\\.\\.", " ... ").replaceAll("([;\\!\\?])([^ ])", "$1 $2").replaceAll("([.,:])([^ 0-9.])", "$1 $2");
                    for (String str4 : replaceAll.split(" ")) {
                        if (!str4.equals("")) {
                            String str5 = "";
                            do {
                                bool = true;
                                Matcher matcher = Pattern.compile("^([" + this.PChar + "])(.)").matcher(str4);
                                if (matcher.find()) {
                                    str4 = str4.replaceAll("^([" + this.PChar + "])(.)", "$2");
                                    sb.append(matcher.group(1) + "\n");
                                    bool = false;
                                }
                                Matcher matcher2 = Pattern.compile("(.)([" + this.FChar + "])$").matcher(str4);
                                if (matcher2.find()) {
                                    str4 = str4.replaceAll("(.)([" + this.FChar + "])$", "$1");
                                    str5 = matcher2.group(2) + "\n" + str5;
                                    bool = false;
                                }
                                Matcher matcher3 = Pattern.compile("([" + this.FChar + "])\\.$").matcher(str4);
                                if (matcher3.find()) {
                                    str4 = str4.replaceAll("([" + this.FChar + "])\\.$", "");
                                    str5 = ".\n" + str5;
                                    if (str4.equals("")) {
                                        str4 = matcher3.group(1);
                                    } else {
                                        str5 = matcher3.group(1) + "\n" + str5;
                                    }
                                    bool = false;
                                }
                            } while (!bool.booleanValue());
                            if (this.abbreviations.contains(str4)) {
                                sb.append(str4 + "\n" + str5);
                            } else if (str4.matches("^([A-Za-z-]\\.)+$")) {
                                sb.append(str4 + "\n" + str5);
                            } else {
                                Matcher matcher4 = Pattern.compile("^(..*)\\.$").matcher(str4);
                                if (matcher4.matches() && !replaceAll.equals(Misc.dots) && (!this.flags.contains(Flag.GALICIAN) || !str4.matches("^[0-9]+\\.$"))) {
                                    str4 = matcher4.group(1);
                                    str5 = ".\n" + str5;
                                    if (this.abbreviations.contains(str4)) {
                                        sb.append(str4 + "\n" + str5);
                                    }
                                }
                                while (true) {
                                    Matcher matcher5 = Pattern.compile("^(--)(.)").matcher(str4);
                                    if (!matcher5.find()) {
                                        break;
                                    }
                                    str4 = str4.replaceAll("^(--)(.)", "$2");
                                    sb.append(matcher5.group(1) + "\n");
                                }
                                if (!this.PClitic.equals("")) {
                                    while (true) {
                                        Matcher matcher6 = Pattern.compile("^(" + this.PClitic + ")(.)").matcher(str4);
                                        if (!matcher6.find()) {
                                            break;
                                        }
                                        str4 = str4.replaceAll("^(" + this.PClitic + ")(.)", "$2");
                                        sb.append(matcher6.group(1) + "\n");
                                    }
                                }
                                while (true) {
                                    Matcher matcher7 = Pattern.compile("(.)(--)$").matcher(str4);
                                    if (!matcher7.find()) {
                                        break;
                                    }
                                    str4 = str4.replaceAll("(.)(--)$", "$1");
                                    str5 = matcher7.group(2) + "\n" + str5;
                                }
                                if (!this.FClitic.equals("")) {
                                    while (true) {
                                        Matcher matcher8 = Pattern.compile("(.)(" + this.FClitic + ")$").matcher(str4);
                                        if (!matcher8.find()) {
                                            break;
                                        }
                                        str4 = str4.replaceAll("(.)(" + this.FClitic + ")$", "$1");
                                        str5 = matcher8.group(2) + "\n" + str5;
                                    }
                                }
                                sb.append(str4 + "\n" + str5);
                            }
                        }
                    }
                }
            }
        }
        LinkedList linkedList = new LinkedList();
        for (String str6 : sb.toString().split("\n")) {
            linkedList.add(str6.replaceAll("^[\\p{javaWhitespace}\\p{gc=Cc}]+", "").replaceAll("[\\p{javaWhitespace}\\p{gc=Cc}]+$", ""));
        }
        return linkedList;
    }
}
