package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.eclipse.jgit.transport.WalkEncryption;

/* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/process/DocumentPreprocessor.class */
public class DocumentPreprocessor implements Iterable<List<HasWord>> {
    private Reader inputReader;
    private final DocType docType;
    private TokenizerFactory<? extends HasWord> tokenizerFactory;
    private String[] sentenceFinalPuncWords;
    private Function<List<HasWord>, List<HasWord>> escaper;
    private String sentenceDelimiter;
    private String tagDelimiter;
    private String elementDelimiter;
    private final String[] sentenceFinalFollowers;
    private boolean keepEmptySentences;
    private static final Redwood.RedwoodChannels log = Redwood.channels(DocumentPreprocessor.class);
    private static final String[] DEFAULT_SENTENCE_DELIMS = {".", "?", "!", "!!", "!!!", "??", "?!", "!?"};
    /**
     * Matches a string made up entirely of one or more whitespace characters.
     * The literal is spelled out here rather than borrowed from an unrelated library constant,
     * so this class does not depend on that library merely to obtain a regex.
     */
    private static final Pattern wsPattern = Pattern.compile("\\s+");

    /* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/process/DocumentPreprocessor$DocType.class */
    /**
     * The kind of input document; {@link DocumentPreprocessor#iterator()} picks the sentence
     * iterator implementation based on this value.
     */
    public enum DocType {
        /** Input is treated as plain text and handled by {@code PlainTextIterator}. */
        Plain,
        /** Input is treated as XML and handled by {@code XMLIterator}. */
        XML
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/process/DocumentPreprocessor$PlainTextIterator.class */
    /**
     * Iterates over the sentences found in the enclosing preprocessor's {@code inputReader}.
     *
     * <p>Tokens are pulled from a tokenizer and grouped into sentences: a sentence ends after a
     * token contained in {@code sentDelims}, optionally extended by any immediately following
     * tokens contained in {@code delimFollowers}. The configuration (delimiters, tokenizer
     * factory, tag delimiter) is read from the enclosing instance when the iterator is created.
     * The reader is closed, and the enclosing {@code inputReader} field set to {@code null}, once
     * the input is exhausted.
     */
    public class PlainTextIterator implements Iterator<List<HasWord>> {
        /** Source of tokens for the whole input. */
        private final Tokenizer<? extends HasWord> tokenizer;
        /** Tokens allowed to trail a sentence delimiter and still belong to that sentence. */
        private final Set<String> delimFollowers;
        /** Splits a raw token into {word, tag}; {@code null} when no tag delimiter is configured. */
        private final Function<String, String[]> splitTag;
        /** The sentence that the next call to {@link #next()} returns; {@code null} if not yet computed. */
        private List<HasWord> nextSent;
        /** Token(s) read while detecting a boundary that belong to the following sentence. */
        private final List<HasWord> nextSentCarryover = Generics.newArrayList();
        /** Tokens that mark the end of a sentence. */
        private final Set<String> sentDelims = Generics.newHashSet();

        /**
         * Builds the delimiter sets, the tokenizer and the optional tag splitter from the
         * enclosing preprocessor's current settings.
         *
         * @throws java.util.regex.PatternSyntaxException if a tag delimiter is configured and the
         *     regular expression built from it is invalid (the delimiter is interpolated into the
         *     expression without quoting)
         */
        public PlainTextIterator() {
            final String delimiter = DocumentPreprocessor.this.sentenceDelimiter;
            boolean whitespaceDelimiter = false;
            if (delimiter == null) {
                // No explicit delimiter: use the sentence-final punctuation list and its followers.
                if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                    this.sentDelims.addAll(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
                }
                this.delimFollowers = Generics.newHashSet(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            } else {
                // An explicit delimiter is the only boundary marker and has no followers.
                this.sentDelims.add(delimiter);
                this.delimFollowers = Generics.newHashSet();
                whitespaceDelimiter = DocumentPreprocessor.wsPattern.matcher(delimiter).matches();
                if (whitespaceDelimiter) {
                    // A whitespace delimiter also splits on the tokenizer's newline token.
                    this.sentDelims.add(PTBTokenizer.getNewlineToken());
                }
            }

            if (DocumentPreprocessor.this.tokenizerFactory == null) {
                // No factory configured: fall back to whitespace tokenization.
                this.tokenizer = WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(DocumentPreprocessor.this.inputReader, this.sentDelims.contains("*NL*"));
            } else if (whitespaceDelimiter) {
                // Request the "tokenizeNLs" option so that line breaks can act as boundaries.
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader, "tokenizeNLs");
            } else {
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader);
            }

            final String tagDelim = DocumentPreprocessor.this.tagDelimiter;
            if (tagDelim == null) {
                this.splitTag = null;
            } else {
                // Matches a delimiter occurrence that is not followed, later on the same line, by another one.
                // Compiled once here instead of on every token.
                final Pattern lastDelimiter = Pattern.compile(String.format("%s(?!.*%s)", tagDelim, tagDelim));
                this.splitTag = rawToken -> {
                    String[] parts = lastDelimiter.split(rawToken.trim());
                    // Anything other than exactly {word, tag} is passed through unsplit (and untrimmed).
                    return parts.length == 2 ? parts : new String[]{rawToken};
                };
            }
        }

        /**
         * Computes the next sentence into {@code nextSent}, or leaves it {@code null} when the
         * input is exhausted. Does nothing once the reader has been closed.
         */
        private void primeNext() {
            if (DocumentPreprocessor.this.inputReader == null) {
                // Already ran out of tokens and closed the reader.
                return;
            }
            // Start from whatever was read past the previous sentence boundary.
            this.nextSent = Generics.newArrayList(this.nextSentCarryover);
            this.nextSentCarryover.clear();

            if (!this.tokenizer.hasNext()) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                if (this.nextSent.isEmpty()) {
                    this.nextSent = null;
                } else if (DocumentPreprocessor.this.escaper != null) {
                    // The carry-over tokens form the final sentence; escape it like any other sentence.
                    this.nextSent = DocumentPreprocessor.this.escaper.apply(this.nextSent);
                }
                return;
            }

            boolean seenBoundary = false;
            do {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    String[] parts = this.splitTag.apply(token.word());
                    token.setWord(parts[0]);
                    if (token instanceof Label) {
                        ((Label) token).setValue(parts[0]);
                    }
                    if (parts.length == 2 && token instanceof HasTag) {
                        ((HasTag) token).setTag(parts[1]);
                    }
                }

                String word = token.word();
                if (this.sentDelims.contains(word)) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(word)) {
                    // First token of the following sentence: keep it for the next call.
                    this.nextSentCarryover.add(token);
                    break;
                }

                // Whitespace-only tokens and the newline token never become part of a sentence.
                boolean whitespaceToken = DocumentPreprocessor.wsPattern.matcher(word).matches()
                        || word.equals(PTBTokenizer.getNewlineToken());
                if (!whitespaceToken) {
                    this.nextSent.add(token);
                }

                // With no possible followers there is no need to read ahead after a boundary.
                if (seenBoundary && this.delimFollowers.isEmpty()) {
                    if (!this.nextSent.isEmpty() || DocumentPreprocessor.this.keepEmptySentences) {
                        break;
                    }
                    // Nothing collected yet: ignore this boundary and keep reading.
                    seenBoundary = false;
                }
            } while (this.tokenizer.hasNext());

            if (this.nextSent.isEmpty() && this.nextSentCarryover.isEmpty() && !DocumentPreprocessor.this.keepEmptySentences) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                this.nextSent = null;
            } else if (DocumentPreprocessor.this.escaper != null) {
                this.nextSent = DocumentPreprocessor.this.escaper.apply(this.nextSent);
            }
        }

        /** Returns {@code true} if another sentence is available, computing it if necessary. */
        @Override // java.util.Iterator
        public boolean hasNext() {
            if (this.nextSent == null) {
                primeNext();
            }
            return this.nextSent != null;
        }

        /**
         * Returns the next sentence.
         *
         * @throws NoSuchElementException if the input is exhausted
         */
        @Override // java.util.Iterator
        public List<HasWord> next() {
            if (this.nextSent == null) {
                primeNext();
            }
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> sentence = this.nextSent;
            this.nextSent = null;
            return sentence;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override // java.util.Iterator
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/process/DocumentPreprocessor$XMLIterator.class */
    /**
     * Iterates over the sentences contained in the XML elements selected by the enclosing
     * preprocessor's {@code elementDelimiter}.
     *
     * <p>Each string produced by the {@code XMLBeginEndIterator} is wrapped in a reader, installed
     * as the enclosing instance's {@code inputReader}, and split into sentences by a fresh
     * {@link PlainTextIterator}. The next sentence is always computed eagerly, one step ahead of
     * the caller.
     */
    public class XMLIterator implements Iterator<List<HasWord>> {
        /** Yields the text of each matching XML element. */
        private final XMLBeginEndIterator<String> xmlItr;
        /** The reader of the whole document, kept so it can be closed at the end. */
        private final Reader originalDocReader;
        /** Sentence iterator over the element currently being processed. */
        private PlainTextIterator plainItr;
        /** The sentence the next call to {@link #next()} returns; {@code null} when exhausted. */
        private List<HasWord> nextSent;

        /** Captures the document reader and immediately looks up the first sentence. */
        public XMLIterator() {
            final Reader documentReader = DocumentPreprocessor.this.inputReader;
            this.xmlItr = new XMLBeginEndIterator<>(documentReader, DocumentPreprocessor.this.elementDelimiter);
            this.originalDocReader = documentReader;
            primeNext();
        }

        /**
         * Advances to the next non-null sentence, moving on to further XML elements as needed.
         * When no elements remain, closes the document reader and leaves {@code nextSent} null.
         */
        private void primeNext() {
            // Keep going until a sentence is found: an element may yield no sentences at all.
            while (true) {
                if (this.plainItr != null && this.plainItr.hasNext()) {
                    this.nextSent = this.plainItr.next();
                } else if (this.xmlItr.hasNext()) {
                    String elementText = this.xmlItr.next();
                    DocumentPreprocessor.this.inputReader = new BufferedReader(new StringReader(elementText));
                    this.plainItr = new PlainTextIterator();
                    this.nextSent = this.plainItr.hasNext() ? this.plainItr.next() : null;
                } else {
                    IOUtils.closeIgnoringExceptions(this.originalDocReader);
                    this.nextSent = null;
                    return;
                }
                if (this.nextSent != null) {
                    return;
                }
            }
        }

        /** Returns {@code true} if a sentence has been prepared for {@link #next()}. */
        @Override // java.util.Iterator
        public boolean hasNext() {
            return this.nextSent != null;
        }

        /**
         * Returns the prepared sentence and eagerly prepares the one after it.
         *
         * @throws NoSuchElementException if no sentence is left
         */
        @Override // java.util.Iterator
        public List<HasWord> next() {
            final List<HasWord> current = this.nextSent;
            if (current == null) {
                throw new NoSuchElementException();
            }
            primeNext();
            return current;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        @Override // java.util.Iterator
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Creates a preprocessor over an already-open reader, treating the input as
     * {@link DocType#Plain}.
     *
     * @param reader source of the document text
     */
    public DocumentPreprocessor(Reader reader) {
        this(reader, DocType.Plain);
    }

    /**
     * Creates a preprocessor over an already-open reader.
     *
     * <p>Defaults: the tokenizer factory is {@code PTBTokenizer.coreLabelFactory()}, sentences end
     * at {@code DEFAULT_SENTENCE_DELIMS}, and the XML element delimiter is {@code ".*"}.
     * The sentence iterators close the reader once the input is exhausted.
     *
     * @param reader source of the document text; must not be {@code null}
     * @param docType how the input is to be interpreted
     * @throws IllegalArgumentException if {@code reader} is {@code null}
     */
    public DocumentPreprocessor(Reader reader, DocType docType) {
        if (reader == null) {
            throw new IllegalArgumentException("Cannot read from null object!");
        }
        this.tokenizerFactory = PTBTokenizer.coreLabelFactory();
        this.sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS;
        this.elementDelimiter = ".*";
        // Closing brackets and quotes that may follow sentence-final punctuation and still belong
        // to the same sentence. The single quote is written as a literal rather than taken from
        // an unrelated library constant.
        this.sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};
        this.docType = docType;
        this.inputReader = reader;
    }

    /**
     * Creates a preprocessor for the document located at {@code str}, treating it as
     * {@link DocType#Plain} and reading it with the "UTF-8" encoding.
     *
     * @param str location of the document
     */
    public DocumentPreprocessor(String str) {
        this(str, DocType.Plain, "UTF-8");
    }

    /**
     * Creates a preprocessor for the document located at {@code str}, reading it with the
     * "UTF-8" encoding.
     *
     * @param str location of the document
     * @param docType how the input is to be interpreted
     */
    public DocumentPreprocessor(String str, DocType docType) {
        this(str, docType, "UTF-8");
    }

    /**
     * Creates a preprocessor for the document located at {@code str}.
     *
     * <p>The reader is obtained from {@code IOUtils.readerFromString(str, str2)}; which kinds of
     * location that helper accepts is not visible here (NOTE(review): presumably file paths and
     * URLs, confirm against IOUtils). Defaults are the same as for the reader-based constructor:
     * {@code PTBTokenizer.coreLabelFactory()}, {@code DEFAULT_SENTENCE_DELIMS}, and {@code ".*"}
     * as XML element delimiter.
     *
     * @param str location of the document; must not be {@code null}
     * @param docType how the input is to be interpreted
     * @param str2 name of the character encoding used to read the document
     * @throws IllegalArgumentException if {@code str} is {@code null}
     * @throws RuntimeIOException if the document cannot be opened
     */
    public DocumentPreprocessor(String str, DocType docType, String str2) {
        if (str == null) {
            throw new IllegalArgumentException("Cannot open null document path!");
        }
        this.tokenizerFactory = PTBTokenizer.coreLabelFactory();
        this.sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS;
        this.elementDelimiter = ".*";
        // Closing brackets and quotes that may follow sentence-final punctuation and still belong
        // to the same sentence. The single quote is written as a literal rather than taken from
        // an unrelated library constant.
        this.sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};
        this.docType = docType;
        try {
            this.inputReader = IOUtils.readerFromString(str, str2);
        } catch (IOException e) {
            // Keep the cause so callers can see why the document could not be opened.
            throw new RuntimeIOException(String.format("%s: Could not open path %s", getClass().getName(), str), e);
        }
    }

    /**
     * Sets whether the plain-text sentence iterator may stop at a boundary, and yield a result,
     * even when no tokens were collected for the sentence.
     *
     * @param z {@code true} to keep empty sentences
     */
    public void setKeepEmptySentences(boolean z) {
        this.keepEmptySentences = z;
    }

    /**
     * Sets the tokens that end a sentence. They are only consulted when no explicit sentence
     * delimiter has been set. The array is stored as given, not copied.
     *
     * @param strArr sentence-final tokens; if {@code null}, no sentence-final tokens are registered
     */
    public void setSentenceFinalPuncWords(String[] strArr) {
        this.sentenceFinalPuncWords = strArr;
    }

    /**
     * Sets the factory used to create the tokenizer for the input.
     *
     * @param tokenizerFactory the factory; if {@code null}, the plain-text iterator falls back to
     *     a whitespace tokenizer
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /**
     * Sets a function that the plain-text iterator applies to a completed sentence; the
     * function's result replaces the sentence.
     *
     * @param function the sentence transformation, or {@code null} for none
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> function) {
        this.escaper = function;
    }

    /**
     * Sets an explicit sentence delimiter token. When non-null it replaces the sentence-final
     * punctuation list as the boundary marker, and no follower tokens are recognized. If the
     * delimiter consists solely of whitespace, the tokenizer's newline token is treated as a
     * delimiter too.
     *
     * @param str the delimiter token, or {@code null} to use the sentence-final punctuation list
     */
    public void setSentenceDelimiter(String str) {
        this.sentenceDelimiter = str;
    }

    /**
     * Sets the delimiter that separates a word from its tag inside a token. The value is inserted
     * unquoted into a regular expression, so regex metacharacters in it are interpreted as such.
     * A token is split only if splitting yields exactly two parts.
     *
     * @param str the tag delimiter (a regular expression fragment), or {@code null} for untagged input
     */
    public void setTagDelimiter(String str) {
        this.tagDelimiter = str;
    }

    /**
     * Sets the element delimiter handed to the {@code XMLBeginEndIterator} for XML input
     * (default {@code ".*"}).
     * NOTE(review): presumably a regular expression over element names; confirm against
     * XMLBeginEndIterator.
     *
     * @param str the element delimiter
     */
    public void setElementDelimiter(String str) {
        this.elementDelimiter = str;
    }

    /**
     * Returns a sentence iterator matching this preprocessor's document type.
     *
     * @return a {@code PlainTextIterator} for {@link DocType#Plain}, an {@code XMLIterator} for
     *     {@link DocType#XML}
     * @throws IllegalStateException if the document type is neither of the two
     */
    @Override // java.lang.Iterable
    public Iterator<List<HasWord>> iterator() {
        final DocType type = this.docType;
        final boolean plain = type == DocType.Plain;
        if (!plain && type != DocType.XML) {
            throw new IllegalStateException("Someone didn't add a handler for a new docType.");
        }
        return plain ? new PlainTextIterator() : new XMLIterator();
    }

    /**
     * Builds the command-line help text: a usage line, a blank line, and one line per option,
     * each terminated by the platform line separator.
     *
     * @return the help text
     */
    private static String usage() {
        final String[] optionLines = {
            "-xml delim              : XML input with associated delimiter.",
            "-encoding type          : Input encoding (default: UTF-8).",
            "-printSentenceLengths   : ",
            "-noTokenization         : Split on newline delimiters only.",
            "-printOriginalText      : Print the original, not normalized form of tokens.",
            "-suppressEscaping       : Suppress PTB escaping.",
            "-tokenizerOptions opts  : Specify custom tokenizer options.",
            "-tag delim              : Input tokens are tagged. Split tags.",
            "-whitespaceTokenization : Whitespace tokenization only.",
            "-sentenceDelimiter delim: Split sentences on this also (\"newline\" for \\n)",
        };
        final String newline = System.lineSeparator();
        StringBuilder help = new StringBuilder(String.format("Usage: java %s [OPTIONS] [file] [< file]%n%n", DocumentPreprocessor.class.getName()));
        help.append("Options:");
        help.append(newline);
        for (String optionLine : optionLines) {
            help.append(optionLine);
            help.append(newline);
        }
        return help.toString();
    }

    /**
     * Returns the command-line option table handed to {@code StringUtils.argsToProperties}:
     * option name mapped to the number of arguments the option takes.
     *
     * <p>Every option that {@code main} reads must be listed. An unlisted flag may take the
     * token that follows it as its value (NOTE(review): per the behaviour of
     * {@code argsToProperties} for unknown flags, confirm), which for a boolean switch would
     * consume the input file name.
     *
     * @return a new, mutable option table
     */
    private static Map<String, Integer> argOptionDefs() {
        Map<String, Integer> optionArity = Generics.newHashMap();
        optionArity.put("help", 0);
        optionArity.put("xml", 1);
        optionArity.put("encoding", 1);
        optionArity.put("printSentenceLengths", 0);
        optionArity.put("noTokenization", 0);
        // Previously missing although main() reads it and usage() documents it.
        optionArity.put("printOriginalText", 0);
        optionArity.put("suppressEscaping", 0);
        optionArity.put("tag", 1);
        optionArity.put("tokenizerOptions", 1);
        optionArity.put("whitespaceTokenization", 0);
        optionArity.put("sentenceDelimiter", 1);
        return optionArity;
    }

    /**
     * Command-line entry point: splits the given files (or standard input when no file is given)
     * into sentences and prints one sentence per line to standard output. The number of sentences
     * read is reported on standard error at the end.
     *
     * <p>At most one of {@code -suppressEscaping}, {@code -tokenizerOptions},
     * {@code -printOriginalText} and {@code -whitespaceTokenization} may be given.
     *
     * @param strArr command-line arguments; see {@code usage()} for the options
     * @throws IOException if the requested encoding is not supported
     */
    public static void main(String[] strArr) throws IOException {
        final Properties options = StringUtils.argsToProperties(strArr, argOptionDefs());
        if (options.containsKey("help")) {
            log.info(usage());
            return;
        }

        // Input/output settings.
        final String encoding = options.getProperty("encoding", "utf-8");
        final boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);

        // XML mode is switched on by the presence of an element delimiter.
        final String xmlElementDelimiter = options.getProperty("xml", null);
        final DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;

        // -noTokenization splits on line breaks; an explicit -sentenceDelimiter overrides it.
        String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
        final String requestedDelimiter = options.getProperty("sentenceDelimiter");
        if (requestedDelimiter != null) {
            sentenceDelimiter = requestedDelimiter.equalsIgnoreCase("newline") ? "\n" : requestedDelimiter;
        }
        final String tagDelimiter = options.getProperty("tag", null);

        // The tokenizer flags are mutually exclusive.
        final boolean suppressEscaping = options.containsKey("suppressEscaping");
        final boolean customTokenizer = options.containsKey("tokenizerOptions");
        final boolean printOriginalText = options.containsKey("printOriginalText");
        final boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
        int tokenizerFlagCount = 0;
        for (boolean flag : new boolean[]{suppressEscaping, customTokenizer, printOriginalText, whitespaceTokenization}) {
            if (flag) {
                tokenizerFlagCount++;
            }
        }
        if (tokenizerFlagCount > 1) {
            log.info("Only one tokenizer flag allowed at a time: ");
            log.info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
            return;
        }

        String[] sentenceFinalPuncWords = null;
        TokenizerFactory<? extends HasWord> tokenizerFactory = null;
        if (suppressEscaping) {
            tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
        } else if (customTokenizer) {
            tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
        } else if (printOriginalText) {
            tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
        } else if (whitespaceTokenization) {
            // The factory stays null (whitespace tokenizer); "*NL*" is added as a sentence delimiter.
            List<String> delimiters = new ArrayList<>(Arrays.asList(DEFAULT_SENTENCE_DELIMS));
            delimiters.add("*NL*");
            sentenceFinalPuncWords = delimiters.toArray(new String[0]);
        } else {
            tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        }

        // Remaining (non-option) arguments are the input files; a single null entry means stdin.
        final String fileList = options.getProperty("", null);
        final String[] files = fileList == null ? new String[1] : fileList.split("\\s+");

        int sentenceCount = 0;
        PrintWriter out = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
        for (String file : files) {
            DocumentPreprocessor preprocessor;
            if (file == null || file.isEmpty()) {
                // Pass docType here as well, so that -xml also applies to standard input.
                preprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding), docType);
            } else {
                preprocessor = new DocumentPreprocessor(file, docType, encoding);
            }
            if (docType == DocType.XML) {
                preprocessor.setElementDelimiter(xmlElementDelimiter);
            }
            preprocessor.setTokenizerFactory(tokenizerFactory);
            if (sentenceDelimiter != null) {
                preprocessor.setSentenceDelimiter(sentenceDelimiter);
            }
            if (tagDelimiter != null) {
                preprocessor.setTagDelimiter(tagDelimiter);
            }
            if (sentenceFinalPuncWords != null) {
                preprocessor.setSentenceFinalPuncWords(sentenceFinalPuncWords);
            }

            for (List<HasWord> sentence : preprocessor) {
                sentenceCount++;
                if (printSentenceLengths) {
                    System.err.printf("Length: %d%n", sentence.size());
                }
                boolean printedToken = false;
                for (HasWord token : sentence) {
                    if (printOriginalText) {
                        // Print the original text and the text around each token; the text
                        // before a token is printed for the first token of the sentence only.
                        CoreLabel label = (CoreLabel) token;
                        if (!printedToken) {
                            out.print((String) label.get(CoreAnnotations.BeforeAnnotation.class));
                            printedToken = true;
                        }
                        out.print((String) label.get(CoreAnnotations.OriginalTextAnnotation.class));
                        out.print((String) label.get(CoreAnnotations.AfterAnnotation.class));
                    } else {
                        if (printedToken) {
                            out.print(" ");
                        }
                        printedToken = true;
                        out.print(token.word());
                    }
                }
                out.println();
            }
        }
        out.close();
        System.err.printf("Read in %d sentences.%n", sentenceCount);
    }
}
