package edu.stanford.nlp.wordseg;

import com.ibm.icu.text.PluralRules;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.process.ChineseDocumentToSentenceProcessor;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.ObjectOutputStream;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/wordseg/ChineseDictionary.class */
/**
 * A set-based lexicon of Chinese words, bucketed by word length.
 *
 * <p>Words of length {@code 1 .. MAX_LEXICON_LENGTH - 1} are stored in the bucket whose index
 * equals their length. Any word of length {@code MAX_LEXICON_LENGTH} or more is truncated to its
 * first {@code MAX_LEXICON_LENGTH} characters and stored in the last bucket. Bucket 0 is
 * allocated but never populated, because empty entries are skipped.
 *
 * <p>Dictionaries are loaded either from plain text files (one entry per line, read as UTF-8) or
 * from serialized files whose name ends in {@code ser.gz}.
 */
public class ChineseDictionary {
    private static final boolean DEBUG = false;

    /** Longest prefix that is stored; also the index of the last length bucket. */
    public static final int MAX_LEXICON_LENGTH = 6;

    /** Comma-separated dictionary list used by {@link #main} when {@code -inputDicts} is absent. */
    private static final String DEFAULT_INPUT_DICTS = "/u/nlp/data/chinese-dictionaries/plain/ne_wikipedia-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/newsexplorer_entities_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/Ch-name-list-utf8.txt,/u/nlp/data/chinese-dictionaries/plain/wikilex-20070908-zh-en.txt,/u/nlp/data/chinese-dictionaries/plain/adso-1.25-050405-monolingual-clean.utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_108k_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_mandarintools_normalized.txt,/u/nlp/data/chinese-dictionaries/plain/harbin-ChineseNames_utf8.txt,/u/nlp/data/chinese-dictionaries/plain/lexicon_HowNet_normalized.txt";

    /** Output path used by {@link #main} when {@code -output} is absent. */
    private static final String DEFAULT_OUTPUT = "/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";

    private static final Redwood.RedwoodChannels logger = Redwood.channels(ChineseDictionary.class);

    /** Precompiled form of {@link ChineseUtils#MID_DOT_REGEX_STR}, shared by all lookups. */
    private static final Pattern midDot = Pattern.compile(ChineseUtils.MID_DOT_REGEX_STR);

    /** Length buckets; index {@code i} holds entries of length {@code i} (last bucket: prefixes). */
    private final Set<String>[] words_;

    /** Optional processor whose {@code normalization} is applied to entries as they are added. */
    private final ChineseDocumentToSentenceProcessor cdtos_;

    /**
     * Writes the length buckets to {@code path} as a single serialized object.
     *
     * @param path destination handed to {@code IOUtils.writeStreamFromString}
     * @throws RuntimeIOException if opening, writing or closing the stream fails
     */
    private void serializeDictionary(String path) {
        logger.info("Serializing dictionaries to " + path + " ... ");
        // try-with-resources closes the stream even when writeObject fails.
        try (ObjectOutputStream out = IOUtils.writeStreamFromString(path)) {
            out.writeObject(this.words_);
        } catch (Exception e) {
            logger.error("Failed", e);
            throw new RuntimeIOException(e);
        }
        // Only reached on success: the catch block above always rethrows.
        logger.info("done.");
    }

    /**
     * Reads a previously serialized bucket array.
     *
     * <p>NOTE(review): this performs Java-native deserialization of whatever {@code path}
     * resolves to (URL, classpath or file system). Only point it at trusted dictionary files.
     *
     * @param path location handed to {@code IOUtils.readObjectFromURLOrClasspathOrFileSystem}
     * @return the deserialized bucket array
     * @throws RuntimeException wrapping any failure while reading or casting the object
     */
    @SuppressWarnings("unchecked") // the serialized form is the Set<String>[] written by serializeDictionary
    private static Set<String>[] loadDictionary(String path) {
        try {
            return (Set[]) IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
        } catch (Exception e) {
            logger.error("Failed to load Chinese dictionary " + path, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Allocates {@code MAX_LEXICON_LENGTH + 1} empty buckets.
     *
     * @return a freshly allocated array in which every slot holds an empty set
     */
    @SuppressWarnings("unchecked") // generic array creation; only Set<String> instances are stored
    private static Set<String>[] newBuckets() {
        Set<String>[] buckets = new HashSet[MAX_LEXICON_LENGTH + 1];
        for (int len = 0; len <= MAX_LEXICON_LENGTH; len++) {
            buckets[len] = Generics.newHashSet();
        }
        return buckets;
    }

    /**
     * Loads a single dictionary file without normalization or mid-dot expansion.
     *
     * @param str path of the dictionary file (not split on commas)
     */
    public ChineseDictionary(String str) {
        this(new String[]{str});
    }

    /**
     * Loads several dictionary files without normalization or mid-dot expansion.
     *
     * @param strArr paths of the dictionary files
     */
    public ChineseDictionary(String[] strArr) {
        this(strArr, null);
    }

    /**
     * Loads several dictionary files without mid-dot expansion.
     *
     * @param strArr paths of the dictionary files
     * @param chineseDocumentToSentenceProcessor processor used to normalize entries; may be
     *        {@code null} to store entries as read
     */
    public ChineseDictionary(String[] strArr, ChineseDocumentToSentenceProcessor chineseDocumentToSentenceProcessor) {
        this(strArr, chineseDocumentToSentenceProcessor, false);
    }

    /**
     * Loads the dictionary files named in a comma-separated list.
     *
     * @param str comma-separated dictionary file paths
     * @param chineseDocumentToSentenceProcessor processor used to normalize entries; may be
     *        {@code null} to store entries as read
     * @param z whether to apply mid-dot expansion to plain-text entries (see the array overload)
     */
    public ChineseDictionary(String str, ChineseDocumentToSentenceProcessor chineseDocumentToSentenceProcessor, boolean z) {
        this(str.split(","), chineseDocumentToSentenceProcessor, z);
    }

    /**
     * Loads and merges all given dictionary files.
     *
     * <p>A path ending in {@code ser.gz} is read as a serialized bucket array and merged bucket by
     * bucket; any other path is read as a plain text file with one entry per line.
     *
     * @param strArr paths of the dictionary files
     * @param chineseDocumentToSentenceProcessor processor used to normalize plain-text entries;
     *        may be {@code null} to store entries as read
     * @param z when {@code true}, every mid-dot variant in a plain-text entry is rewritten to
     *        "·" and a second copy of the entry with the mid dots removed is added as well
     */
    public ChineseDictionary(String[] strArr, ChineseDocumentToSentenceProcessor chineseDocumentToSentenceProcessor, boolean z) {
        logger.info(String.format("Loading Chinese dictionaries from %d file%s:%n",
                Integer.valueOf(strArr.length), strArr.length == 1 ? "" : "s"));
        for (String path : strArr) {
            logger.info("  " + path);
        }

        this.words_ = newBuckets();
        this.cdtos_ = chineseDocumentToSentenceProcessor;

        for (String path : strArr) {
            if (path.endsWith("ser.gz")) {
                Set<String>[] loaded = loadDictionary(path);
                for (int len = 0; len <= MAX_LEXICON_LENGTH; len++) {
                    this.words_[len].addAll(loaded[len]);
                    // Drop the reference right away so the loaded bucket can be collected.
                    loaded[len] = null;
                }
            } else {
                addDict(path, z);
            }
        }

        int total = 0;
        for (Set<String> bucket : this.words_) {
            total += bucket.size();
        }
        logger.info(String.format("Done. Unique words in ChineseDictionary is: %d.%n", Integer.valueOf(total)));
    }

    /**
     * Reads a plain-text dictionary (UTF-8, one entry per line) and adds every entry.
     *
     * @param path file to read
     * @param expandMidDot when {@code true}, mid-dot variants are rewritten to "·" before the
     *        entry is added, and entries containing a mid dot are added a second time with the
     *        mid dots removed
     */
    private void addDict(String path, boolean expandMidDot) {
        String[] lines = IOUtils.slurpFileNoExceptions(path, "utf-8").split("\n");
        // Plain ": " separator instead of the ICU PluralRules.KEYWORD_RULE_SEPARATOR constant
        // the decompiler substituted here; assumed to have the same value -- TODO confirm.
        logger.info("  " + path + ": " + lines.length + " entries");
        for (String line : lines) {
            String item = line.trim();
            if (expandMidDot) {
                // Reuse the precompiled pattern rather than recompiling the regex per line.
                item = midDot.matcher(item).replaceAll("·");
            }
            addOneDict(item);
            if (expandMidDot && midDot.matcher(item).find()) {
                addOneDict(midDot.matcher(item).replaceAll(""));
            }
        }
    }

    /**
     * Adds one entry to the bucket matching its length; empty entries are ignored.
     *
     * <p>Entries of {@code MAX_LEXICON_LENGTH} characters or more are truncated to their first
     * {@code MAX_LEXICON_LENGTH} characters. The bucket is chosen from the length before
     * normalization; normalization is applied only when a processor was supplied.
     *
     * @param item the entry to add
     */
    private void addOneDict(String item) {
        int length = item.length();
        if (length == 0) {
            return;
        }
        int bucket = Math.min(length, MAX_LEXICON_LENGTH);
        String entry = length < MAX_LEXICON_LENGTH ? item : item.substring(0, MAX_LEXICON_LENGTH);
        if (this.cdtos_ != null) {
            entry = this.cdtos_.normalization(entry);
        }
        this.words_[bucket].add(entry);
    }

    /**
     * Tests whether a word is in the dictionary.
     *
     * <p>Words of {@code MAX_LEXICON_LENGTH} characters or more are matched on their first
     * {@code MAX_LEXICON_LENGTH} characters only. The argument is looked up as given; no
     * normalization is applied here.
     *
     * @param str the word to look up
     * @return {@code true} if the word (or its truncated prefix) is stored
     */
    public boolean contains(String str) {
        int length = str.length();
        if (length < MAX_LEXICON_LENGTH) {
            return this.words_[length].contains(str);
        }
        return this.words_[MAX_LEXICON_LENGTH].contains(str.substring(0, MAX_LEXICON_LENGTH));
    }

    /**
     * Builds a dictionary from plain-text files and serializes it.
     *
     * <p>Recognized flags, each declared with one argument: {@code -inputDicts} (comma-separated
     * input files) and {@code -output} (destination of the serialized dictionary). Built-in
     * default paths are used for whichever flag is missing.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        Map<String, Integer> flagsToNumArgs = Generics.newHashMap();
        flagsToNumArgs.put("-inputDicts", 1);
        flagsToNumArgs.put("-output", 1);
        Map<String, String[]> options = StringUtils.argsToMap(strArr, flagsToNumArgs);

        String inputDicts = options.containsKey("-inputDicts") ? options.get("-inputDicts")[0] : DEFAULT_INPUT_DICTS;
        String output = options.containsKey("-output") ? options.get("-output")[0] : DEFAULT_OUTPUT;

        ChineseDictionary dictionary =
                new ChineseDictionary(inputDicts.split(","), new ChineseDocumentToSentenceProcessor(null), true);
        dictionary.serializeDictionary(output);
    }
}
