package edu.stanford.nlp.pipeline;

import com.ibm.icu.text.PluralRules;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;
import edu.usc.ir.sentiment.analysis.cmdline.SentimentConstant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.pdfbox.contentstream.operator.OperatorName;

/* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/pipeline/ChineseSegmenterAnnotator.class */
/**
 * Annotator that turns raw text into word tokens using a CRF sequence classifier.
 *
 * <p>For every sentence in {@link CoreAnnotations.SentencesAnnotation} (or for the annotation
 * itself when no sentences are present) the annotator works in two passes:
 * <ol>
 *   <li>{@code splitCharacters} creates one {@link CoreLabel} per code point, recording character
 *       offsets and whether the code point lies inside an XML-like tag, and stores the list under
 *       {@link SegmenterCoreAnnotations.CharactersAnnotation};</li>
 *   <li>{@code runSegmentation} runs the segmenter on the text, re-aligns the resulting words with
 *       the per-character labels and stores the tokens under
 *       {@link CoreAnnotations.TokensAnnotation}.</li>
 * </ol>
 */
public class ChineseSegmenterAnnotator implements Annotator {
    private static final String DEFAULT_MODEL_NAME = "segment";
    private static final String DEFAULT_SEG_LOC = "/u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/ctb7.chris6.lex.gz";
    // NOTE(review): the leading "//" looks like a typo for "/"; left unchanged because it is a runtime path.
    private static final String DEFAULT_SER_DICTIONARY = "//u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/dict-chris6.ser.gz";
    private static final String DEFAULT_SIGHAN_CORPORA_DICT = "/u/nlp/data/chinese-segmenter/stanford-seg-2010/releasedata/";

    // String values used by this class. They are kept as local literals so that the class does not
    // depend on unrelated libraries merely to obtain a string constant.

    /** Property suffix (after the annotator-name prefix) that holds the model location. */
    private static final String MODEL_KEY = "model";
    /** Value of the newline-is-sentence-break property meaning newlines never break sentences. */
    private static final String NEWLINE_BREAK_NEVER = "never";
    /** Value of the newline-is-sentence-break property meaning two consecutive newlines break sentences. */
    private static final String NEWLINE_BREAK_TWO = "two";
    /** XMLCharAnnotation value: the character is not part of an XML tag. */
    private static final String XML_CHAR_OUTSIDE = "0";
    /** XMLCharAnnotation value: the character is inside an XML tag (but not its first character). */
    private static final String XML_CHAR_INSIDE = "1";
    /** XMLCharAnnotation value: the character is the first character of an XML tag. */
    private static final String XML_CHAR_BEGINNING = "beginning";
    /** XMLCharAnnotation value: the character is whitespace or a control character inside an XML tag. */
    private static final String XML_CHAR_WHITESPACE = "whitespace";
    /** ChineseSegAnnotation value: the character starts a new segment. */
    private static final String SEG_START = "1";
    /** ChineseSegAnnotation value: the character continues the current segment. */
    private static final String SEG_CONTINUE = "0";
    /** Word/value given to tokens that consist solely of a line break. */
    private static final String NEWLINE_TOKEN = "*NL*";
    /** Replacement for spaces inside XML tokens when space normalization is enabled (U+00A0). */
    private static final char NON_BREAKING_SPACE = '\u00A0';

    private final AbstractSequenceClassifier<?> segmenter;
    private final boolean VERBOSE;
    private final boolean tokenizeNewline;
    private final boolean sentenceSplitOnTwoNewlines;
    private final boolean normalizeSpace;
    private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseSegmenterAnnotator.class);
    private static final String separator = "\\R";
    private static final Pattern separatorPattern = Pattern.compile(separator);
    private static final Pattern xmlPattern = Pattern.compile("<([!?][A-Za-z-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:.-]*([ ]+([A-Za-z][A-Za-z0-9_:.-]*|[A-Za-z][A-Za-z0-9_:.-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z][A-Za-z0-9_:.-]*)))*[ ]*/?|/[A-Za-z][A-Za-z0-9_:.-]*)[ ]*>");

    /** Creates a non-verbose annotator that uses the default segmentation model location. */
    public ChineseSegmenterAnnotator() {
        this(DEFAULT_SEG_LOC, false);
    }

    /**
     * Creates an annotator for the given model, using the default dictionary locations.
     *
     * @param segLoc location of the segmentation model
     * @param verbose whether progress and debugging information is logged
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
        this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
    }

    /**
     * Creates an annotator for the given model and dictionaries. The arguments are packed into
     * {@code segment.*} properties and handed to {@link #ChineseSegmenterAnnotator(String, Properties)}.
     *
     * @param segLoc location of the segmentation model ({@code segment.model})
     * @param verbose whether progress and debugging information is logged ({@code segment.verbose})
     * @param serDictionary value for {@code segment.serDictionary}
     * @param sighanCorporaDict value for {@code segment.sighanCorporaDict}
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
        this(DEFAULT_MODEL_NAME, PropertiesUtils.asProperties("segment.serDictionary", serDictionary, "segment.sighanCorporaDict", sighanCorporaDict, "segment.verbose", Boolean.toString(verbose), "segment.model", segLoc));
    }

    /**
     * Creates an annotator configured from properties.
     *
     * <p>Every property whose key starts with {@code name + "."} is examined: the {@code model}
     * entry gives the model location, and every other entry is passed (with the prefix removed)
     * to the classifier loader. {@code name.verbose} and {@code name.normalizeSpace} additionally
     * configure this annotator. The newline handling is derived from
     * {@link StanfordCoreNLP#NEWLINE_IS_SENTENCE_BREAK_PROPERTY} and
     * {@link StanfordCoreNLP#NEWLINE_SPLITTER_PROPERTY}.
     *
     * @param name prefix under which this annotator's properties are stored
     * @param props the properties to read
     * @throws RuntimeException if {@code name.model} is missing or the classifier cannot be loaded
     */
    public ChineseSegmenterAnnotator(String name, Properties props) {
        String model = null;
        Properties modelProps = new Properties();
        String prefix = name + '.';
        for (String key : props.stringPropertyNames()) {
            if (key.startsWith(prefix)) {
                String modelKey = key.substring(prefix.length());
                if (modelKey.equals(MODEL_KEY)) {
                    model = props.getProperty(key);
                } else {
                    modelProps.setProperty(modelKey, props.getProperty(key));
                }
            }
        }
        this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
        this.normalizeSpace = PropertiesUtils.getBool(props, name + ".normalizeSpace", false);
        if (model == null) {
            throw new RuntimeException("Expected a property " + name + ".model");
        }
        if (this.VERBOSE) {
            log.info("Loading Segmentation Model ... ");
        }
        try {
            this.segmenter = CRFClassifier.getClassifier(model, modelProps);
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            // Wrap checked loading failures, preserving the cause.
            throw new RuntimeException(e);
        }
        String newlineIsSentenceBreak = props.getProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, NEWLINE_BREAK_NEVER);
        // Newlines become tokens when they may break sentences or when the newline splitter is enabled.
        this.tokenizeNewline = !NEWLINE_BREAK_NEVER.equals(newlineIsSentenceBreak)
                || Boolean.parseBoolean(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
        this.sentenceSplitOnTwoNewlines = NEWLINE_BREAK_TWO.equals(newlineIsSentenceBreak);
    }

    /**
     * Segments the text of the annotation. When the annotation already carries sentences, each
     * sentence is segmented separately; otherwise the annotation is segmented as a whole.
     *
     * @param annotation the annotation to add character and token annotations to
     */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public void annotate(Annotation annotation) {
        if (this.VERBOSE) {
            log.info("Adding Segmentation annotation ... ");
        }
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            doOneSentence(annotation);
            return;
        }
        for (CoreMap sentence : sentences) {
            doOneSentence(sentence);
        }
    }

    /** Runs both passes (character splitting, then segmentation) on one sentence or document. */
    private void doOneSentence(CoreMap annotation) {
        splitCharacters(annotation);
        runSegmentation(annotation);
    }

    /**
     * Builds one {@link CoreLabel} per retained code point of the text and stores the list under
     * {@link SegmenterCoreAnnotations.CharactersAnnotation}.
     *
     * <p>Each label records the character, its begin/end offsets, whether it starts a segment, and
     * its relation to XML-like tags found by {@code xmlPattern}. Whitespace and control characters
     * outside XML tags are dropped, except for {@code '\n'} when newlines are tokenized and the
     * newline is neither leading/trailing nor (in two-newline mode) isolated.
     *
     * @param annotation the sentence or document whose text is split
     */
    private void splitCharacters(CoreMap annotation) {
        String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> charTokens = new ArrayList<>();
        int length = origText.length();

        // Locate the first XML-like tag; MAX_VALUE / -1 mean "no tag".
        int xmlStartOffset = Integer.MAX_VALUE;
        int xmlEndOffset = -1;
        Matcher xmlMatcher = xmlPattern.matcher(origText);
        if (xmlMatcher.find()) {
            xmlStartOffset = xmlMatcher.start();
            xmlEndOffset = xmlMatcher.end();
        }

        // Determine the offsets of the first and last character that is not a line-break character,
        // so that leading and trailing line breaks can be recognized below.
        String lineSeparator = System.lineSeparator();
        int firstNonNewlineOffset = -1;
        int lastNonNewlineOffset = length;
        for (int offset = 0, cpCharCount; offset < length; offset += cpCharCount) {
            int cp = origText.codePointAt(offset);
            cpCharCount = Character.charCount(cp);
            String charString = origText.substring(offset, offset + cpCharCount);
            boolean isLineBreakChar = cp == '\n' || cp == '\r' || lineSeparator.contains(charString);
            if (!isLineBreakChar) {
                if (firstNonNewlineOffset == -1) {
                    firstNonNewlineOffset = offset;
                }
                lastNonNewlineOffset = offset;
            }
        }

        // Rolling window of "is '\n'" flags for the previous and current code point.
        boolean prevIsNewline = false;
        boolean currIsNewline = length > 0 && origText.codePointAt(0) == '\n';
        boolean seg = true; // the first character always starts a segment

        for (int offset = 0, cpCharCount; offset < length; offset += cpCharCount) {
            int cp = origText.codePointAt(offset);
            cpCharCount = Character.charCount(cp);
            String charString = origText.substring(offset, offset + cpCharCount);

            // Just past the current tag: search for the next one from here.
            if (offset == xmlEndOffset && xmlMatcher.find(offset)) {
                xmlStartOffset = xmlMatcher.start();
                xmlEndOffset = xmlMatcher.end();
            }

            int nextOffset = offset + cpCharCount;
            boolean nextIsNewline = nextOffset < length && origText.codePointAt(nextOffset) == '\n';

            boolean skipCharacter = false;
            boolean isXMLCharacter = false;
            boolean isWhitespaceOrControl = Character.isSpaceChar(cp) || Character.isISOControl(cp);
            if (offset == xmlStartOffset) {
                seg = true;
                isXMLCharacter = true;
            } else if (offset > xmlStartOffset && offset < xmlEndOffset) {
                seg = false;
                isXMLCharacter = true;
            } else if (isWhitespaceOrControl) {
                // Whitespace ends the current segment; whatever follows starts a new one.
                seg = true;
                boolean isLeadingOrTrailingNewline = offset < firstNonNewlineOffset || offset > lastNonNewlineOffset;
                boolean isolatedNewline = currIsNewline && !prevIsNewline && !nextIsNewline;
                // Keep the character only if it is a '\n' that should become a token.
                skipCharacter = !(this.tokenizeNewline && currIsNewline)
                        || isLeadingOrTrailingNewline
                        || (this.sentenceSplitOnTwoNewlines && isolatedNewline);
            }

            if (!skipCharacter) {
                CoreLabel charLabel = new CoreLabel();
                charLabel.set(CoreAnnotations.ChineseCharAnnotation.class, charString);
                charLabel.set(CoreAnnotations.ChineseSegAnnotation.class, seg ? SEG_START : SEG_CONTINUE);
                String xmlChar;
                if (!isXMLCharacter) {
                    xmlChar = XML_CHAR_OUTSIDE;
                } else if (isWhitespaceOrControl) {
                    xmlChar = XML_CHAR_WHITESPACE;
                } else if (offset == xmlStartOffset) {
                    xmlChar = XML_CHAR_BEGINNING;
                } else {
                    xmlChar = XML_CHAR_INSIDE;
                }
                charLabel.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, xmlChar);
                charLabel.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
                charLabel.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, offset + cpCharCount);
                charTokens.add(charLabel);
                seg = false;
            }

            // Slide the newline window forward by one code point.
            prevIsNewline = currIsNewline;
            currIsNewline = nextIsNewline;
        }
        annotation.set(SegmenterCoreAnnotations.CharactersAnnotation.class, charTokens);
    }

    /**
     * Advances through the per-character labels until their concatenated characters equal the
     * given word.
     *
     * <p>Before matching, a word that is exactly {@code "\r"} is treated as {@code "\n"}, and
     * carriage returns are removed from any other word.
     *
     * @param sentChars the per-character labels produced by {@code splitCharacters}
     * @param pos index of the first label to consume
     * @param w the word to match
     * @return the index just past the last consumed label
     * @throws RuntimeException if the labels run out before the word is matched
     */
    private static int advancePos(List<CoreLabel> sentChars, int pos, String w) {
        String expected = w.equals("\r") ? "\n" : w.replace("\r", "");
        StringBuilder eaten = new StringBuilder();
        while (!expected.contentEquals(eaten)) {
            if (pos >= sentChars.size()) {
                throw new RuntimeException("Ate the whole text without matching.  Expected is '" + expected + "', ate '" + eaten + "'");
            }
            eaten.append(sentChars.get(pos).get(CoreAnnotations.ChineseCharAnnotation.class));
            pos++;
        }
        return pos;
    }

    /**
     * Runs the segmenter on the text and converts its words into tokens with character offsets,
     * storing them under {@link CoreAnnotations.TokensAnnotation}.
     *
     * <p>Words are aligned with the labels from {@code splitCharacters}. Consecutive words that
     * belong to one XML tag are merged into a single token; whitespace inside the tag is
     * re-inserted as single spaces.
     *
     * @param annotation the sentence or document to segment; must already carry
     *     {@link SegmenterCoreAnnotations.CharactersAnnotation}
     */
    private void runSegmentation(CoreMap annotation) {
        String text = annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> sentChars = annotation.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
        if (this.VERBOSE) {
            log.info("sentChars (length " + sentChars.size() + ") is " + SentenceUtils.listToString(sentChars, edu.stanford.nlp.util.StringUtils.EMPTY_STRING_ARRAY));
        }
        List<CoreLabel> tokens = new ArrayList<>();
        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);

        List<String> words;
        if (this.tokenizeNewline) {
            // Drop leading and trailing line breaks.
            text = text.replaceAll("^[\\r\\n]+", "").replaceAll("[\\r\\n]+$", "");
            if (this.sentenceSplitOnTwoNewlines) {
                // Remove isolated newlines. The replacement is applied twice on purpose: a match
                // consumes the character after the newline, so in "x\nx\nx" the second newline is
                // only reachable on a second pass.
                text = text.replaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                text = text.replaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
            }
            // Segment line by line so that no word crosses a line boundary; pure line-break
            // pieces are kept as words of their own.
            words = new ArrayList<>();
            for (String line : edu.stanford.nlp.util.StringUtils.splitLinesKeepNewlines(text)) {
                if (separatorPattern.matcher(line).matches()) {
                    words.add(line);
                } else {
                    words.addAll(this.segmenter.segmentString(line));
                }
            }
        } else {
            text = text.replaceAll("[\r\n]", "");
            words = this.segmenter.segmentString(text);
        }
        if (this.VERBOSE) {
            log.info(text + "\n--->\n" + words + " (length " + words.size() + ')');
        }

        int pos = 0; // index into sentChars
        StringBuilder xmlBuffer = new StringBuilder(); // text of the XML tag being assembled
        int xmlBegin = -1; // begin offset of the XML tag being assembled, -1 if none
        for (String w : words) {
            CoreLabel fl = sentChars.get(pos);
            String xmlCharAnnotation = fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class);
            if (this.VERBOSE) {
                log.info("Working on word " + w + ", sentChar " + fl.toShorterString() + " (sentChars index " + pos + ')');
            }

            boolean outsideXml = XML_CHAR_OUTSIDE.equals(xmlCharAnnotation);
            // Plain text or the start of another tag ends the tag collected so far.
            if ((outsideXml || XML_CHAR_BEGINNING.equals(xmlCharAnnotation)) && xmlBuffer.length() > 0) {
                int end = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                tokens.add(makeXmlToken(xmlBuffer.toString(), true, xmlBegin, end));
                xmlBegin = -1;
                xmlBuffer = new StringBuilder();
            }

            if (outsideXml) {
                fl.set(CoreAnnotations.ChineseSegAnnotation.class, SEG_START);
                if (w.isEmpty()) {
                    if (this.VERBOSE) {
                        log.warn("Encountered an empty word. Shouldn't happen?");
                    }
                    continue;
                }
                int begin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                pos = advancePos(sentChars, pos, w);
                if (pos - 1 >= sentChars.size()) {
                    // Purely defensive: advancePos never returns an index beyond sentChars.size().
                    log.error("Error: on word " + w + " at position " + (pos - w.length()) + " trying to get at position " + (pos - 1));
                    log.error("last element of sentChars is " + sentChars.get(sentChars.size() - 1));
                } else {
                    int end = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                    tokens.add(makeXmlToken(w, false, begin, end));
                }
            } else {
                // Inside an XML tag: copy skipped whitespace into the buffer as single spaces until
                // the labels are back in sync with the segmenter output. Note that fl changes here.
                while (XML_CHAR_WHITESPACE.equals(fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class))) {
                    xmlBuffer.append(' ');
                    pos++;
                    fl = sentChars.get(pos);
                }
                xmlBuffer.append(w);
                pos = advancePos(sentChars, pos, w);
                if (xmlBegin < 0) {
                    xmlBegin = fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                }
            }
        }

        // Flush a tag that runs to the end of the text.
        if (xmlBuffer.length() > 0) {
            int end = sentChars.get(pos - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            tokens.add(makeXmlToken(xmlBuffer.toString(), true, xmlBegin, end));
        }
        if (this.VERBOSE) {
            for (CoreLabel token : tokens) {
                log.info(token.toShorterString());
            }
        }
    }

    /**
     * Creates a token with the given text and character offsets.
     *
     * <p>The original text is always stored unchanged. The word and value are {@code "*NL*"} when
     * the text is a single line break; otherwise, when {@code doNormalization} is set and space
     * normalization is enabled, spaces are replaced with non-breaking spaces (U+00A0).
     *
     * @param tokenText the text of the token
     * @param doNormalization whether space normalization may be applied (used for XML tokens)
     * @param charOffsetBegin begin character offset of the token
     * @param charOffsetEnd end character offset of the token
     * @return the new token
     */
    private CoreLabel makeXmlToken(String tokenText, boolean doNormalization, int charOffsetBegin, int charOffsetEnd) {
        CoreLabel token = new CoreLabel();
        token.setOriginalText(tokenText);
        String word = tokenText;
        if (separatorPattern.matcher(tokenText).matches()) {
            word = NEWLINE_TOKEN;
        } else if (doNormalization && this.normalizeSpace) {
            word = tokenText.replace(' ', NON_BREAKING_SPACE);
        }
        token.setWord(word);
        token.setValue(word);
        token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charOffsetBegin);
        token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charOffsetEnd);
        if (this.VERBOSE) {
            log.info("Adding token " + token.toShorterString());
        }
        return token;
    }

    /** Returns an empty set: this annotator declares no required annotations. */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.emptySet();
    }

    /** Returns the annotation classes this annotator declares as satisfied. */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        return new HashSet<>(Arrays.asList(
                CoreAnnotations.TextAnnotation.class,
                CoreAnnotations.TokensAnnotation.class,
                CoreAnnotations.CharacterOffsetBeginAnnotation.class,
                CoreAnnotations.CharacterOffsetEndAnnotation.class,
                CoreAnnotations.BeforeAnnotation.class,
                CoreAnnotations.AfterAnnotation.class,
                CoreAnnotations.TokenBeginAnnotation.class,
                CoreAnnotations.TokenEndAnnotation.class,
                CoreAnnotations.PositionAnnotation.class,
                CoreAnnotations.IndexAnnotation.class,
                CoreAnnotations.OriginalTextAnnotation.class,
                CoreAnnotations.ValueAnnotation.class));
    }
}
