package de.unihd.dbs.uima.annotator.alllanguagestokenizer;

import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.internal.util.Misc;
import org.apache.uima.jcas.JCas;

/* loaded from: input_file:BOOT-INF/lib/heideltime-2.2.1.jar:de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.class */
/**
 * UIMA annotator that splits the document text into {@link Token} annotations and then groups
 * those tokens into {@link Sentence} annotations.
 *
 * <p>Tokenization works on whitespace-separated words: tags of the form {@code <...>} are kept
 * whole, punctuation is peeled off both ends of each word, and a fixed list of proclitics and
 * enclitics (e.g. {@code l'}, {@code 's}, {@code -t-il}) is split off. The resulting token
 * strings are then located in the original document text to obtain their offsets.
 */
public class AllLanguagesTokenizer extends JCasAnnotator_ImplBase {
    /** Regex character-class body: punctuation split off the front of a word. */
    private static final String PCHAR = "\\[¿¡\\{\\(\\`\"‚„†‡‹‘’“”•–—›'";
    /** Regex character-class body: punctuation split off the end of a word. */
    private static final String FCHAR = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%‚„…†‡‰‹‘’“”•–—›";
    /**
     * Regex alternation of clitics split off the end of a word.
     * NOTE(review): "-mmes?" looks like "-mêmes?" with a lost character - confirm against the
     * intended clitic list before changing it.
     */
    private static final String FCLITIC = "'(s|re|ve|d|m|em|ll)|n't"
            + "|-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l"
            + "|-la|-las|-lo|-los|-nos";
    /** Regex alternation of clitics split off the front of a word. */
    private static final String PCLITIC = "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'"
            + "|[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";

    // Line normalization (applied once per input line).
    private static final Pattern LINE_WHITESPACE = Pattern.compile("[\r\n\t]");
    private static final Pattern SPACE_INSIDE_TAG = Pattern.compile("(<[^<> ]*) ([^<>]*>)");
    private static final Pattern SEPARATOR_SPACE =
            Pattern.compile("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]");
    private static final Pattern TAG = Pattern.compile("(<[^<>]*>)");
    private static final Pattern LEADING_MARK = Pattern.compile("^ÿ");
    private static final Pattern TRAILING_MARK = Pattern.compile("ÿ$");
    private static final Pattern REPEATED_MARK = Pattern.compile("ÿÿÿ*");
    private static final Pattern WHOLE_TAG = Pattern.compile("^<.*>$");

    // Punctuation spacing (applied once per segment).
    private static final Pattern ELLIPSIS = Pattern.compile("\\.\\.\\.");
    private static final Pattern STRONG_PUNCT_GLUED = Pattern.compile("([;\\!\\?])([^ ])");
    private static final Pattern WEAK_PUNCT_GLUED = Pattern.compile("([.,:])([^ 0-9.])");

    // Word splitting.
    private static final Pattern LEADING_PUNCT = Pattern.compile("^([" + PCHAR + "])(.)");
    private static final Pattern TRAILING_PUNCT = Pattern.compile("(.)([" + FCHAR + "])$");
    private static final Pattern TRAILING_PUNCT_PERIOD = Pattern.compile("([" + FCHAR + "])\\.$");
    private static final Pattern ABBREVIATION = Pattern.compile("^([A-Za-z-]\\.)+$");
    private static final Pattern FINAL_PERIOD = Pattern.compile("^(..*)\\.$");
    private static final Pattern LEADING_DASHES = Pattern.compile("^(--)(.)");
    private static final Pattern TRAILING_DASHES = Pattern.compile("(.)(--)$");
    private static final Pattern PROCLITIC = Pattern.compile("^(" + PCLITIC + ")(.)");
    private static final Pattern ENCLITIC = Pattern.compile("(.)(" + FCLITIC + ")$");

    // Sentence splitting.
    private static final Pattern SENTENCE_END = Pattern.compile("[.:!\\?]+");
    private static final Pattern DIGITS = Pattern.compile("[\\d]+");
    private static final Pattern FOLLOWING_INITIAL = Pattern.compile(" [A-Z][.-]");
    private static final Pattern CLOSING_QUOTES = Pattern.compile("[»’'\"‛”‟›〞』」﹄＂＇｣﹂]+");

    /** Creates the annotator; all tokenization tables are static. */
    public AllLanguagesTokenizer() {
    }

    /**
     * Adds {@link Token} annotations and then {@link Sentence} annotations to the given CAS.
     *
     * @param jCas the CAS whose document text is processed
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override // org.apache.uima.analysis_component.JCasAnnotator_ImplBase
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        tokenize(jCas);
        sentenceTokenize(jCas);
    }

    /**
     * Tokenizes the document text and adds one {@link Token} (with an empty POS) per token to
     * the CAS indexes.
     *
     * @param jCas the CAS to read the document text from and to add tokens to
     * @return the created tokens in document order; empty if there is no document text
     */
    public List<Token> tokenize(JCas jCas) {
        List<Token> tokens = new LinkedList<Token>();
        String documentText = jCas.getDocumentText();
        if (documentText == null) {
            return tokens;
        }

        // Phase 1: produce the token strings, one per line, in document order.
        StringBuilder tokenLines = new StringBuilder();
        for (String line : documentText.split("\n")) {
            for (String segment : splitIntoSegments(line)) {
                if (WHOLE_TAG.matcher(segment).matches()) {
                    tokenLines.append(segment).append('\n');
                } else {
                    tokenizeSegment(segment, tokenLines);
                }
            }
        }

        // Phase 2: locate each token string in the original text, scanning left to right.
        int searchFrom = 0;
        for (String text : tokenLines.toString().split("\n")) {
            if (text.isEmpty()) {
                // Empty/whitespace-only documents yield a single empty piece; not a token.
                continue;
            }
            int begin = documentText.indexOf(text, searchFrom);
            if (begin < 0) {
                // The normalized token does not occur verbatim (e.g. a tag whose inner tab was
                // rewritten to a space). Skip it rather than emit a token at offset -1, which
                // would also reset the search position for all following tokens.
                continue;
            }
            Token token = new Token(jCas);
            token.setBegin(begin);
            token.setPos("");
            token.setEnd(begin + text.length());
            token.addToIndexes();
            searchFrom = token.getEnd();
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Splits one input line at whitespace while keeping {@code <...>} tags as single segments.
     * The characters 'ÿ' and 'þ' serve as temporary markers: the first space inside a tag is
     * shielded, every separator space becomes a split mark, and each tag is surrounded by marks.
     */
    private static String[] splitIntoSegments(String line) {
        String text = LINE_WHITESPACE.matcher(line).replaceAll(" ");
        text = SPACE_INSIDE_TAG.matcher(text).replaceAll("$1ÿ$2");
        text = SEPARATOR_SPACE.matcher(text).replaceAll("þ");
        text = text.replace('ÿ', ' ').replace('þ', 'ÿ');
        text = TAG.matcher(text).replaceAll("ÿ$1ÿ");
        text = LEADING_MARK.matcher(text).replaceAll("");
        text = TRAILING_MARK.matcher(text).replaceAll("");
        text = REPEATED_MARK.matcher(text).replaceAll("ÿ");
        return text.split("ÿ");
    }

    /** Spaces out punctuation glued to following text, then splits each resulting word. */
    private static void tokenizeSegment(String segment, StringBuilder out) {
        String spaced = ELLIPSIS.matcher(" " + segment + " ").replaceAll(" ... ");
        spaced = STRONG_PUNCT_GLUED.matcher(spaced).replaceAll("$1 $2");
        spaced = WEAK_PUNCT_GLUED.matcher(spaced).replaceAll("$1 $2");
        for (String word : spaced.split(" ")) {
            if (!word.isEmpty()) {
                splitWord(word, out);
            }
        }
    }

    /**
     * Splits a single whitespace-free word into tokens and appends them, one per line, to
     * {@code out}. Leading pieces are appended immediately; trailing pieces are collected in a
     * suffix that is appended after the remaining core of the word.
     */
    private static void splitWord(String word, StringBuilder out) {
        String token = word;
        String suffix = "";

        // Peel punctuation off both ends until nothing changes any more.
        boolean changed;
        do {
            changed = false;
            Matcher leading = LEADING_PUNCT.matcher(token);
            if (leading.find()) {
                out.append(leading.group(1)).append('\n');
                token = cut(token, leading, 1);
                changed = true;
            }
            Matcher trailing = TRAILING_PUNCT.matcher(token);
            if (trailing.find()) {
                suffix = trailing.group(2) + "\n" + suffix;
                token = cut(token, trailing, 2);
                changed = true;
            }
            Matcher punctThenPeriod = TRAILING_PUNCT_PERIOD.matcher(token);
            if (punctThenPeriod.find()) {
                String punct = punctThenPeriod.group(1);
                token = cut(token, punctThenPeriod, 0);
                suffix = ".\n" + suffix;
                if (token.isEmpty()) {
                    // The word was just "<punct>." - the punctuation becomes the core token.
                    token = punct;
                } else {
                    suffix = punct + "\n" + suffix;
                }
                changed = true;
            }
        } while (changed);

        // Dotted abbreviations such as "U.S.A." keep their periods.
        if (ABBREVIATION.matcher(token).matches()) {
            out.append(token).append('\n').append(suffix);
            return;
        }

        // Split off a final period, but keep an ellipsis in one piece. The comparison must be
        // made on the token: the space-padded segment can never equal "...", which previously
        // left this guard dead and split "..." into ".." and ".".
        Matcher finalPeriod = FINAL_PERIOD.matcher(token);
        if (finalPeriod.matches() && !token.equals("...")) {
            token = finalPeriod.group(1);
            suffix = ".\n" + suffix;
        }

        Matcher m;
        while ((m = LEADING_DASHES.matcher(token)).find()) {
            out.append(m.group(1)).append('\n');
            token = cut(token, m, 1);
        }
        while ((m = PROCLITIC.matcher(token)).find()) {
            out.append(m.group(1)).append('\n');
            token = cut(token, m, 1);
        }
        // Trailing "--" goes to the suffix. This must use the end-anchored pattern: reusing the
        // leading pattern here moved the characters following "--" into the suffix one by one,
        // in reverse order.
        while ((m = TRAILING_DASHES.matcher(token)).find()) {
            suffix = m.group(2) + "\n" + suffix;
            token = cut(token, m, 2);
        }
        while ((m = ENCLITIC.matcher(token)).find()) {
            suffix = m.group(2) + "\n" + suffix;
            token = cut(token, m, 2);
        }
        out.append(token).append('\n').append(suffix);
    }

    /** Returns {@code text} with the span matched by the given group removed. */
    private static String cut(String text, Matcher match, int group) {
        return text.substring(0, match.start(group)) + text.substring(match.end(group));
    }

    /**
     * Groups the indexed {@link Token} annotations into {@link Sentence} annotations and adds
     * them to the CAS indexes. A sentence ends at the last token, or at a token consisting only
     * of the characters {@code . : ! ?} unless the previous token is all digits or the text
     * right after it is a space, a capital letter and then '.' or '-'. A token made up of
     * closing quote characters directly after the boundary is included in the sentence.
     *
     * @param jCas the CAS whose tokens are grouped
     * @return the created sentences in document order
     */
    public List<Sentence> sentenceTokenize(JCas jCas) {
        List<Sentence> sentences = new LinkedList<Sentence>();
        String documentText = jCas.getDocumentText();
        FSIterator<?> tokens = jCas.getAnnotationIndex(Token.type).iterator();
        Sentence sentence = new Sentence(jCas);
        boolean sentenceOpen = false;
        Token previous = null;
        Token current = null;
        while (tokens.hasNext()) {
            if (current != null) {
                previous = current;
            }
            current = (Token) tokens.next();
            if (!sentenceOpen) {
                sentenceOpen = true;
                sentence.setBegin(current.getBegin());
            }
            if (!tokens.hasNext() || endsSentence(previous, current, documentText)) {
                sentenceOpen = false;
                sentence.setEnd(current.getEnd());
                if (tokens.hasNext()) {
                    // Peek at the next token; step back if it is not a closing quote.
                    Token following = (Token) tokens.next();
                    if (CLOSING_QUOTES.matcher(following.getCoveredText()).matches()) {
                        sentence.setEnd(following.getEnd());
                    } else {
                        tokens.moveToPrevious();
                    }
                }
                sentence.addToIndexes();
                sentences.add(sentence);
                sentence = new Sentence(jCas);
            }
        }
        return sentences;
    }

    /**
     * Decides whether {@code current} closes a sentence.
     *
     * @param previous the token before {@code current}, or {@code null} for the first token
     * @param current the token under inspection
     * @param documentText the full document text the token offsets refer to
     */
    private static boolean endsSentence(Token previous, Token current, String documentText) {
        if (!SENTENCE_END.matcher(current.getCoveredText()).matches()) {
            return false;
        }
        // previous is null when the document starts with punctuation; dereferencing it
        // unconditionally used to throw a NullPointerException.
        if (previous != null && DIGITS.matcher(previous.getCoveredText()).matches()) {
            return false;
        }
        int end = current.getEnd();
        if (documentText.length() - end <= 2) {
            return true;
        }
        return !FOLLOWING_INITIAL.matcher(documentText.substring(end, end + 3)).matches();
    }
}
