package de.unihd.dbs.uima.reader.aceternreader;

import de.unihd.dbs.uima.types.heideltime.Dct;
import de.unihd.dbs.uima.types.heideltime.SourceDocInfo;
import edu.stanford.nlp.ie.pascal.ISODateInstance;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.preflight.PreflightConstants;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.FileUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.eclipse.jdt.internal.compiler.impl.CompilerOptions;

/* loaded from: input_file:BOOT-INF/lib/heideltime-2.2.1.jar:de/unihd/dbs/uima/reader/aceternreader/ACETernReader.class */
/**
 * Collection reader that turns every regular file of a configured input
 * directory into one CAS and, when requested, adds a document creation time
 * ({@link Dct}) annotation extracted from the raw file content.
 *
 * <p>Configuration parameters:
 * <ul>
 *   <li>{@link #PARAM_INPUTDIR} - path of the directory to read (sub-directories
 *       are skipped, the directory is not traversed recursively);</li>
 *   <li>{@link #PARAM_DCT} - whether {@link #setDCT(String, JCas, String)} is
 *       called for every document.</li>
 * </ul>
 */
public class ACETernReader extends CollectionReader_ImplBase {
    private static Logger logger = null;

    /** Prefix used in the diagnostic messages written to {@code System.err}. */
    private static final String COMPONENT_ID = "de.unihd.dbs.uima.reader.aceternreader";

    public static final String PARAM_INPUTDIR = "InputDirectory";
    public static final String PARAM_DCT = "AnnotateCreationTime";

    // ------------------------------------------------------------------
    // Regular-expression building blocks (compile-time string constants).
    // ------------------------------------------------------------------

    private static final String MONTH_NAMES =
            "January|February|March|April|May|June|July|August|September|October|November|December";

    private static final String MONTH_ABBREVIATIONS =
            "Jan\\.|Feb\\.|Mar\\.|Apr\\.|May\\.|Jun\\.|Jul\\.|Aug\\.|Sep\\.|Oct\\.|Nov\\.|Dec\\."
            + "|JAN\\.|FEB\\.|MAR\\.|APR\\.|MAY\\.|JUN\\.|JUL\\.|AUG\\.|SEP\\.|OCT\\.|NOV\\.|DEC\\.";

    /** yyyy-MM-dd(T| )HH:mm:ss */
    private static final String DATE_TAG_1 =
            "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d\\d):(\\d\\d):(\\d\\d)(.*?)";
    /** yyyy-MM-dd(T| )H:mm:ss */
    private static final String DATE_TAG_2 =
            "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(T| )(\\d):(\\d\\d):(\\d\\d)(.*?)";
    /** MM/dd/yyyy HH:mm:ss.SS */
    private static final String DATE_TAG_3 =
            "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)";
    /** MM/dd/yyyy H:mm:ss.SS */
    private static final String DATE_TAG_4 =
            "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d) (\\d):(\\d\\d):(\\d\\d)\\.(\\d\\d)(.*?)";
    /** yyyy-MM-dd */
    private static final String DATE_TAG_5 = "(.*?)(\\d\\d\\d\\d)-(\\d\\d)-(\\d\\d)(.*?)";
    /** MM/dd/yyyy */
    private static final String DATE_TAG_6 = "(.*?)(\\d\\d)/(\\d\\d)/(\\d\\d\\d\\d)(.*?)";
    /** MonthName d[d][,] yyyy */
    private static final String DATE_TAG_7 =
            "(.*?)(" + MONTH_NAMES + ") ([\\d]?[\\d]),? (\\d\\d\\d\\d)(.*?)";
    /** yyyyMMdd-HH:mm:ss */
    private static final String DATE_TAG_8 =
            "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)-(\\d\\d):(\\d\\d):(\\d\\d)(.*?)";
    /** yyyyMMdd */
    private static final String DATE_TAG_9 = "(.*?)(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*?)";

    // ------------------------------------------------------------------
    // Pre-compiled patterns (compiled once instead of once per document).
    // ------------------------------------------------------------------

    /**
     * Opening date tag followed by any of the nine supported formats; group 2
     * is the tag content. Note that the closing tag belongs to the last
     * alternative only - this mirrors the expression used so far.
     */
    private static final Pattern DATE_TIME_TAG = Pattern.compile(
            "(<DATETIME>|<DATE_TIME>|<DATE>|<STORY_REF_TIME>)("
            + "(" + DATE_TAG_1 + ")|(" + DATE_TAG_2 + ")|(" + DATE_TAG_3 + ")|(" + DATE_TAG_4 + ")|("
            + DATE_TAG_5 + ")|(" + DATE_TAG_6 + ")|(" + DATE_TAG_7 + ")|(" + DATE_TAG_8 + ")|("
            + DATE_TAG_9 + ")(</DATETIME>|</DATE_TIME>|</DATE>|</STORY_REF_TIME>))");

    private static final Pattern FORMAT_1 = Pattern.compile(DATE_TAG_1);
    private static final Pattern FORMAT_2 = Pattern.compile(DATE_TAG_2);
    private static final Pattern FORMAT_3 = Pattern.compile(DATE_TAG_3);
    private static final Pattern FORMAT_4 = Pattern.compile(DATE_TAG_4);
    private static final Pattern FORMAT_5 = Pattern.compile(DATE_TAG_5);
    private static final Pattern FORMAT_6 = Pattern.compile(DATE_TAG_6);
    private static final Pattern FORMAT_7 = Pattern.compile(DATE_TAG_7);
    private static final Pattern FORMAT_8 = Pattern.compile(DATE_TAG_8);
    private static final Pattern FORMAT_9 = Pattern.compile(DATE_TAG_9);

    private static final Pattern DATE_LINE = Pattern.compile("DATE:[\\s]+(" + DATE_TAG_7 + ")");
    private static final Pattern STORY_REF_ABBREVIATED_STRICT = Pattern.compile(
            "<STORY_REF_TIME>(" + MONTH_ABBREVIATIONS + ")[\\s]+([\\d]?[\\d])</STORY_REF_TIME>");
    private static final Pattern STORY_REF_COMPACT = Pattern.compile(
            "<STORY_REF_TIME>.*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*?</STORY_REF_TIME>");
    private static final Pattern DOCNO_COMPACT = Pattern.compile(
            "<DOCNO>.*?(\\d\\d\\d\\d)(\\d\\d)(\\d\\d).*?</DOCNO>");
    private static final Pattern STORY_REF_MONTH_NAME = Pattern.compile(
            "<STORY_REF_TIME>.*?(" + MONTH_NAMES + ") ([\\d]?[\\d]).*?</STORY_REF_TIME>");
    private static final Pattern PUBLISH_DATE = Pattern.compile(
            "Publish Date:[\\s]+(\\d\\d)/(\\d\\d)/(\\d\\d)");
    private static final Pattern STORY_REF_ABBREVIATED = Pattern.compile(
            "<STORY_REF_TIME>.*?(" + MONTH_ABBREVIATIONS + ")[\\s]+([\\d]?[\\d]).*?</STORY_REF_TIME>");
    private static final Pattern DOC_ID_DATE = Pattern.compile(
            "(<DOC ID=\".*?\" DATE=\")(" + DATE_TAG_9 + ")(\">)");

    /** Removed from the document text before it is stored in the CAS. */
    private static final Pattern QUOTED_POST = Pattern.compile("(?s)<QUOTE PREVIOUSPOST=.*?/>");

    /** Lower-case three-letter month prefixes; index + 1 is the month number. */
    private static final String[] MONTH_PREFIXES = {
        "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"
    };

    /** Files still to be delivered; empty until {@link #initialize()} has run. */
    private ArrayList<File> mFiles = new ArrayList<>();
    /** Index into {@link #mFiles} of the next file to read. */
    private int currentIndex;
    /** Number of files found by {@link #initialize()}. */
    public Integer numberOfDocuments = 0;
    /** Whether a DCT annotation is created for every document. */
    public Boolean annotateDCT = false;

    /**
     * Reads the configuration parameters and collects all regular files of
     * the input directory.
     *
     * @throws ResourceInitializationException if the input directory
     *         parameter is missing, does not denote an existing directory, or
     *         the directory cannot be listed
     */
    @Override
    public void initialize() throws ResourceInitializationException {
        logger = getUimaContext().getLogger();
        logger.log(Level.INFO, "initialize() - Initializing ACETern-Reader...");

        // An unset parameter yields null; treat it as "do not annotate"
        // instead of failing later with a NullPointerException in getNext().
        this.annotateDCT = Boolean.TRUE.equals(getConfigParameterValue(PARAM_DCT));

        Object inputDirValue = getConfigParameterValue(PARAM_INPUTDIR);
        if (!(inputDirValue instanceof String)) {
            throw new ResourceInitializationException(
                    ResourceInitializationException.CONFIG_SETTING_ABSENT,
                    new Object[] {PARAM_INPUTDIR});
        }
        File directory = new File(((String) inputDirValue).trim());
        this.currentIndex = 0;

        // isDirectory() is false for non-existing paths as well.
        if (!directory.isDirectory()) {
            throw new ResourceInitializationException(
                    ResourceConfigurationException.DIRECTORY_NOT_FOUND,
                    new Object[] {PARAM_INPUTDIR, getMetaData().getName(), directory.getPath()});
        }

        // listFiles() returns null when an I/O error occurs.
        File[] entries = directory.listFiles();
        if (entries == null) {
            throw new ResourceInitializationException(
                    new IOException("Cannot list files in directory " + directory.getPath()));
        }

        ArrayList<File> files = new ArrayList<>(entries.length);
        for (File entry : entries) {
            if (!entry.isDirectory()) {
                files.add(entry);
            }
        }
        this.mFiles = files;
        this.numberOfDocuments = files.size();
    }

    /**
     * @return {@code true} while there are files that have not been delivered yet
     */
    @Override
    public boolean hasNext() {
        return this.currentIndex < this.mFiles.size();
    }

    /**
     * Loads the next file into the given CAS: sets the document text (with
     * {@code <QUOTE PREVIOUSPOST=.../>} elements removed), adds a
     * {@link SourceDocInfo} carrying the file URI and, if enabled, a DCT.
     *
     * @param cas the CAS to populate
     * @throws IOException if the file cannot be read
     * @throws CollectionException if no JCas can be obtained from {@code cas}
     */
    @Override
    public void getNext(CAS cas) throws IOException, CollectionException {
        // Progress indicator on the console, one dot per document.
        System.err.print(".");
        try {
            JCas jcas = cas.getJCas();

            File file = this.mFiles.get(this.currentIndex++);
            logger.log(Level.INFO, "getNext(CAS) - Reading file " + file.getName());

            String content = FileUtils.file2String(file);
            jcas.setDocumentText(QUOTED_POST.matcher(content).replaceAll(""));

            String uri = file.getAbsoluteFile().toURI().toURL().toString();
            SourceDocInfo sourceDocInfo = new SourceDocInfo(jcas);
            sourceDocInfo.setUri(uri);
            sourceDocInfo.addToIndexes();

            // The DCT is searched in the unmodified file content.
            if (Boolean.TRUE.equals(this.annotateDCT)) {
                setDCT(content, jcas, uri);
            }
        } catch (CASException e) {
            throw new CollectionException(e);
        }
    }

    /**
     * Extracts the document creation time from the raw document and adds it
     * to the CAS as a {@link Dct} annotation spanning offsets 0 to 1.
     *
     * <p>The last {@code <DATETIME>}, {@code <DATE_TIME>}, {@code <DATE>} or
     * {@code <STORY_REF_TIME>} tag with a recognised date format is used. If
     * there is no such tag, several document-specific fallbacks are tried.
     * When nothing can be determined a message is written to
     * {@code System.err} and no annotation is created.
     *
     * @param xml      raw content of the document
     * @param jcas     CAS the annotation is added to
     * @param filename value stored in the annotation's filename feature
     */
    public void setDCT(String xml, JCas jcas, String filename) {
        MatchResult tag = lastMatch(DATE_TIME_TAG, xml);
        if (tag != null) {
            String content = tag.group(2);
            String value = normalizeTagContent(content);
            if (value == null) {
                System.err.println();
                System.err.println("[" + COMPONENT_ID + "] cannot set dct with datetimetag: " + content);
                return;
            }
            addDct(jcas, filename, value);
            return;
        }

        String date = findFallbackDate(xml);
        if (date == null) {
            System.err.println();
            System.err.println("[" + COMPONENT_ID + "] Cannot set Document Creation Time - no datetimetag found in " + filename + "!");
            return;
        }
        addDct(jcas, filename, date);
    }

    /**
     * Converts the content of a date tag into a normalised value.
     *
     * @param content text following the opening date tag
     * @return {@code yyyy-MM-ddTHH:mm:ss[.SS]} when the content carries a time,
     *         {@code yyyy-MM-dd} when it carries a date only, or {@code null}
     *         when no supported format matches
     */
    private String normalizeTagContent(String content) {
        // The order matters: formats with a time are tested before the
        // date-only formats they contain.
        MatchResult m = matchFormat(FORMAT_1, content);
        if (m != null) {
            return m.group(2) + "-" + m.group(3) + "-" + m.group(4)
                    + "T" + m.group(6) + ":" + m.group(7) + ":" + m.group(8);
        }
        m = matchFormat(FORMAT_2, content);
        if (m != null) {
            // Single-digit hour: pad to two digits.
            return m.group(2) + "-" + m.group(3) + "-" + m.group(4)
                    + "T0" + m.group(6) + ":" + m.group(7) + ":" + m.group(8);
        }
        m = matchFormat(FORMAT_3, content);
        if (m != null) {
            return m.group(4) + "-" + m.group(2) + "-" + m.group(3)
                    + "T" + m.group(5) + ":" + m.group(6) + ":" + m.group(7) + "." + m.group(8);
        }
        m = matchFormat(FORMAT_4, content);
        if (m != null) {
            return m.group(4) + "-" + m.group(2) + "-" + m.group(3)
                    + "T0" + m.group(5) + ":" + m.group(6) + ":" + m.group(7) + "." + m.group(8);
        }
        m = matchFormat(FORMAT_5, content);
        if (m != null) {
            return m.group(2) + "-" + m.group(3) + "-" + m.group(4);
        }
        m = matchFormat(FORMAT_6, content);
        if (m != null) {
            return m.group(4) + "-" + m.group(2) + "-" + m.group(3);
        }
        m = matchFormat(FORMAT_7, content);
        if (m != null) {
            return m.group(4) + "-" + normMonth(m.group(2)) + "-" + normDay(m.group(3));
        }
        m = matchFormat(FORMAT_8, content);
        if (m != null) {
            return m.group(2) + "-" + m.group(3) + "-" + m.group(4)
                    + "T" + m.group(5) + ":" + m.group(6) + ":" + m.group(7);
        }
        m = matchFormat(FORMAT_9, content);
        if (m != null) {
            return m.group(2) + "-" + m.group(3) + "-" + m.group(4);
        }
        return null;
    }

    /**
     * Tries the fallback strategies for documents without a usable date tag.
     *
     * @param xml raw content of the document
     * @return the date as {@code yyyy-MM-dd}, or {@code null} if none was found
     */
    private String findFallbackDate(String xml) {
        String date = null;

        // 1) Year from a "DATE: <Month> d, yyyy" line, month and day from an
        //    abbreviated <STORY_REF_TIME>. Without a year the result would be
        //    an invalid value such as "-01-05", so the step is skipped then.
        String year = "";
        for (MatchResult dateLine : findMatches(DATE_LINE, xml)) {
            MatchResult m = matchFormat(FORMAT_7, dateLine.group(1));
            if (m != null) {
                year = m.group(4);
            }
        }
        if (!year.isEmpty()) {
            MatchResult m = lastMatch(STORY_REF_ABBREVIATED_STRICT, xml);
            if (m != null) {
                date = year + "-" + normMonth(m.group(1)) + "-" + normDay(m.group(2));
            }
        }

        // 2) <STORY_REF_TIME> containing yyyyMMdd.
        if (date == null) {
            MatchResult m = lastMatch(STORY_REF_COMPACT, xml);
            if (m != null) {
                date = m.group(1) + "-" + m.group(2) + "-" + m.group(3);
            }
        }

        // 3) Year from <DOCNO>, month and day from a <STORY_REF_TIME> with a
        //    full month name.
        if (date == null) {
            MatchResult docno = lastMatch(DOCNO_COMPACT, xml);
            if (docno != null) {
                MatchResult m = lastMatch(STORY_REF_MONTH_NAME, xml);
                if (m != null) {
                    date = docno.group(1) + "-" + normMonth(m.group(1)) + "-" + normDay(m.group(2));
                }
            }
        }

        // 4) Two-digit year from "Publish Date: MM/dd/yy" (prefixed with "19"),
        //    month and day from an abbreviated <STORY_REF_TIME>.
        if (date == null) {
            MatchResult publish = lastMatch(PUBLISH_DATE, xml);
            if (publish != null) {
                MatchResult m = lastMatch(STORY_REF_ABBREVIATED, xml);
                if (m != null) {
                    date = "19" + publish.group(3) + "-" + normMonth(m.group(1)) + "-" + normDay(m.group(2));
                }
            }
        }

        // 5) DATE attribute of the <DOC ID="..."> element.
        if (date == null) {
            MatchResult doc = lastMatch(DOC_ID_DATE, xml);
            if (doc != null) {
                String content = doc.group(2);
                MatchResult m = matchFormat(FORMAT_9, content);
                if (m != null) {
                    date = m.group(2) + "-" + m.group(3) + "-" + m.group(4);
                } else {
                    System.err.println();
                    System.err.println("[" + COMPONENT_ID + "] cannot set dct with datetimetag: " + content);
                }
            }
        }
        return date;
    }

    /** Creates the DCT annotation with the given value and indexes it. */
    private static void addDct(JCas jcas, String filename, String value) {
        Dct dct = new Dct(jcas);
        dct.setBegin(0);
        dct.setEnd(1);
        dct.setFilename(filename);
        dct.setTimexId("dct");
        dct.setValue(value);
        dct.addToIndexes();
    }

    /** Returns the last occurrence of {@code pattern} in {@code text}, or {@code null}. */
    private static MatchResult lastMatch(Pattern pattern, CharSequence text) {
        MatchResult last = null;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            last = matcher.toMatchResult();
        }
        return last;
    }

    /**
     * Returns the last occurrence of {@code pattern} in {@code content}, but
     * only if the pattern also matches {@code content} as a whole.
     */
    private static MatchResult matchFormat(Pattern pattern, String content) {
        return pattern.matcher(content).matches() ? lastMatch(pattern, content) : null;
    }

    /**
     * Pads a single-digit day ("1" to "9") with a leading zero.
     *
     * @param str day of month as text
     * @return the two-digit form for "1" to "9", otherwise {@code str} unchanged
     */
    public String normDay(String str) {
        if (str.length() == 1) {
            char digit = str.charAt(0);
            if (digit >= '1' && digit <= '9') {
                return "0" + str;
            }
        }
        return str;
    }

    /**
     * Maps a month name or abbreviation to its two-digit number. Only the
     * first three letters are considered, ignoring case.
     *
     * @param str month name, e.g. "January", "Jan." or "JAN."
     * @return "01" to "12", or {@code str} unchanged if it is not recognised
     */
    public String normMonth(String str) {
        String lower = str.toLowerCase();
        for (int i = 0; i < MONTH_PREFIXES.length; i++) {
            if (lower.startsWith(MONTH_PREFIXES[i])) {
                int month = i + 1;
                return month < 10 ? "0" + month : String.valueOf(month);
            }
        }
        return str;
    }

    /** Nothing to release. */
    @Override
    public void close() throws IOException {
    }

    /**
     * @return a single progress entry: files consumed so far out of the total
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] {new ProgressImpl(this.currentIndex, this.mFiles.size(), "entities")};
    }

    /**
     * @return number of files collected by {@link #initialize()}
     */
    public int getNumberOfDocuments() {
        return this.mFiles.size();
    }

    /**
     * Collects all non-overlapping occurrences of a pattern.
     *
     * @param pattern      the pattern to search for
     * @param charSequence the text to search in
     * @return the match results in order of occurrence (possibly empty)
     */
    public static Iterable<MatchResult> findMatches(Pattern pattern, CharSequence charSequence) {
        ArrayList<MatchResult> results = new ArrayList<>();
        Matcher matcher = pattern.matcher(charSequence);
        while (matcher.find()) {
            results.add(matcher.toMatchResult());
        }
        return results;
    }
}
