package edu.stanford.nlp.ie.machinereading.domains.ace.reader;

import edu.stanford.nlp.ie.machinereading.common.DomReader;
import edu.stanford.nlp.ie.machinereading.domains.ace.reader.RobustTokenizer;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.Generics;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;

/* loaded from: input_file:BOOT-INF/lib/stanford-corenlp-4.0.0.jar:edu/stanford/nlp/ie/machinereading/domains/ace/reader/AceSentenceSegmenter.class */
/**
 * Tokenizes the raw text of an ACE document and groups the tokens into sentences.
 *
 * <p>Segmentation is purely token driven:
 * <ul>
 *   <li>a sentence ends after one of the tokens in {@link #sentenceFinalPunc};</li>
 *   <li>a closing double quote that directly follows such a token is kept in the same sentence
 *       when an odd number of quotes has been seen so far in that sentence;</li>
 *   <li>every token recognized by {@code AceToken.isSgml} is placed in a sentence of its own.</li>
 * </ul>
 */
public class AceSentenceSegmenter extends DomReader {
    /** Tokens that terminate a sentence. */
    private static final String[] sentenceFinalPunc = {".", "!", "?"};

    /** Set form of {@link #sentenceFinalPunc}, used for membership tests; filled by the static initializer. */
    private static final Set<String> sentenceFinalPuncSet = Generics.newHashSet();

    /** Path prefix used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_MAIN_PREFIX = "/home/mcclosky/data/ACE2005/English/nw/timex2norm/AFP_ENG_20030502.0614";

    static {
        for (String punc : sentenceFinalPunc) {
            sentenceFinalPuncSet.add(punc);
        }
    }

    /**
     * Reads the file {@code str + AceDocument.ORIG_EXT}, tokenizes it with {@link RobustTokenizer}
     * and splits the token stream into sentences.
     *
     * @param str path of the document without the {@code AceDocument.ORIG_EXT} extension
     * @return the sentences in document order; never {@code null}, and no sentence is empty
     * @throws IOException if the file cannot be read
     * @throws SAXException declared for interface compatibility; not raised by the code in this method
     * @throws ParserConfigurationException declared for interface compatibility; not raised by the code in this method
     */
    public static List<List<AceToken>> tokenizeAndSegmentSentences(String str) throws IOException, SAXException, ParserConfigurationException {
        List<List<AceToken>> sentences = new ArrayList<>();
        String text = IOUtils.slurpFile(new File(str + AceDocument.ORIG_EXT));
        List<RobustTokenizer.WordToken> tokens = new RobustTokenizer(text).tokenizeToWordTokens();

        List<AceToken> current = new ArrayList<>();
        // Number of double-quote tokens seen so far in the current sentence.
        int quoteCount = 0;

        for (int idx = 0; idx < tokens.size(); idx++) {
            RobustTokenizer.WordToken token = tokens.get(idx);
            String word = token.getWord();
            boolean sgml = AceToken.isSgml(word);

            if (sgml) {
                // An SGML tag always starts a new sentence: close whatever was open before it.
                current = flushSentence(sentences, current);
                quoteCount = 0;
            }

            // Convert only after the flush above, so that the index handed to the token
            // (sentences.size()) is the position of the sentence the token actually lands in.
            current.add(wordTokenToAceToken(token, sentences.size()));
            if (word.equals("\"")) {
                quoteCount++;
            }

            boolean sentenceFinal = sentenceFinalPuncSet.contains(word);
            if (sentenceFinal
                    && idx < tokens.size() - 1
                    && quoteCount % 2 == 1
                    && tokens.get(idx + 1).getWord().equals("\"")) {
                // An open quote is pending and the next token closes it: keep the closing quote
                // with this sentence and skip over it. The quote counter need not be updated
                // because it is reset just below.
                current.add(wordTokenToAceToken(tokens.get(idx + 1), sentences.size()));
                idx++;
            }

            if (sentenceFinal || sgml) {
                // Sentence-final punctuation ends the sentence; an SGML tag is isolated in its own.
                current = flushSentence(sentences, current);
                quoteCount = 0;
            }
        }

        // Keep tokens that follow the last sentence boundary instead of silently dropping them.
        flushSentence(sentences, current);
        return sentences;
    }

    /**
     * Appends {@code sentence} to {@code sentences} unless it is empty.
     *
     * @return a fresh, empty list to collect the next sentence in
     */
    private static List<AceToken> flushSentence(List<List<AceToken>> sentences, List<AceToken> sentence) {
        if (!sentence.isEmpty()) {
            sentences.add(sentence);
        }
        return new ArrayList<>();
    }

    /**
     * Wraps a tokenizer token in an {@link AceToken}.
     *
     * <p>The word text and the start/end offsets (as decimal strings) are copied from
     * {@code wordToken}; the four remaining string fields are left empty
     * (NOTE(review): presumably lemma/POS/chunk/NE annotations - confirm against AceToken).
     *
     * @param wordToken token produced by {@link RobustTokenizer}
     * @param i value passed as the last constructor argument of {@link AceToken}; callers in this
     *          class pass the index of the sentence the token belongs to
     * @return the converted token
     */
    public static AceToken wordTokenToAceToken(RobustTokenizer.WordToken wordToken, int i) {
        return new AceToken(wordToken.getWord(), "", "", "", "", Integer.toString(wordToken.getStart()), Integer.toString(wordToken.getEnd()), i);
    }

    /**
     * Small manual driver: segments one document and prints each sentence on its own line.
     *
     * @param strArr optional; {@code strArr[0]} is the document path prefix (without extension).
     *               When absent, the original hard-coded sample path is used.
     */
    public static void main(String[] strArr) throws IOException, SAXException, ParserConfigurationException {
        String prefix = (strArr != null && strArr.length > 0) ? strArr[0] : DEFAULT_MAIN_PREFIX;
        for (List<AceToken> sentence : tokenizeAndSegmentSentences(prefix)) {
            System.out.println("s: [" + sentence + "]");
        }
    }
}
