package gate.creole.splitter;

import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.BomStrippingInputStreamReader;
import gate.util.InvalidOffsetException;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:WEB-INF/lib/gate-core-6.1.jar:gate/creole/splitter/RegexSentenceSplitter.class */
public class RegexSentenceSplitter extends AbstractLanguageAnalyser {
    /** Parameter-name constant with value {@code "document"}; matches the {@link #document} field. */
    public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";
    /**
     * Parameter-name constant with value {@code "inputASName"}.
     * NOTE(review): this class declares no field or accessor of that name - confirm whether it is still used.
     */
    public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
    /** Parameter-name constant with value {@code "outputASName"}; matches the {@link #outputASName} field. */
    public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
    /** Parameter-name constant with value {@code "encoding"}; matches the {@link #encoding} field. */
    public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";
    /**
     * Parameter-name constant with value {@code "splitListURL"}.
     * NOTE(review): the fields declared here are {@link #internalSplitListURL} and
     * {@link #externalSplitListURL}; no field is called {@code splitListURL} - confirm against the
     * resource configuration.
     */
    public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";
    /** Parameter-name constant with value {@code "nonSplitListURL"}; matches the {@link #nonSplitListURL} field. */
    public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";
    /** Serialization version identifier. */
    private static final long serialVersionUID = 1;
    /** The document whose content {@link #execute()} reads and annotates. */
    protected Document document;
    /**
     * Name of the annotation set that receives the generated annotations; {@code null} or a
     * blank string makes {@link #execute()} use the document's unnamed annotation set.
     */
    protected String outputASName;
    /** Character encoding passed to {@code compilePattern} when the three list resources are read in {@link #init()}. */
    protected String encoding;
    /** Location of the list compiled into {@link #internalSplitsPattern}. */
    protected URL internalSplitListURL;
    /** Location of the list compiled into {@link #externalSplitsPattern}. */
    protected URL externalSplitListURL;
    /** Location of the list compiled into {@link #nonSplitsPattern}. */
    protected URL nonSplitListURL;
    /** Pattern for "internal" splits: the matched text is included in the sentence that it ends. */
    protected Pattern internalSplitsPattern;
    /** Pattern for "external" splits: the sentence ends before the matched text. */
    protected Pattern externalSplitsPattern;
    /** Pattern whose matches are veto regions; a split overlapping one of them is discarded. */
    protected Pattern nonSplitsPattern;

    /* loaded from: input_file:WEB-INF/lib/gate-core-6.1.jar:gate/creole/splitter/RegexSentenceSplitter$MatchResultComparator.class */
    private class MatchResultComparator implements Comparator<MatchResult> {
        private MatchResultComparator() {
        }

        @Override // java.util.Comparator
        public int compare(MatchResult matchResult, MatchResult matchResult2) {
            if (matchResult == null && matchResult2 == null) {
                return 0;
            }
            if (matchResult == null) {
                return 1;
            }
            if (matchResult2 == null) {
                return -1;
            }
            return matchResult.start() - matchResult2.start();
        }
    }

    /**
     * Reads a list of regular expressions from a resource and compiles them into one pattern.
     *
     * <p>Each line is trimmed; empty lines and lines starting with {@code //} are skipped. Every
     * remaining line is wrapped in a non-capturing group and the groups are joined with
     * {@code |}, so the resulting pattern matches whatever any single line matches.
     *
     * <p>If the resource contains no usable line, a pattern that never matches is returned.
     * Compiling the empty string instead would yield a pattern that matches (with zero length)
     * at every offset of the input.
     *
     * @param url location of the list to read
     * @param str name of the character encoding used to decode the resource
     * @return the compiled alternation of all expressions in the list
     * @throws UnsupportedEncodingException declared for callers; raised while setting up the reader
     * @throws IOException if the resource cannot be opened, read or closed
     */
    protected Pattern compilePattern(URL url, String str) throws UnsupportedEncodingException, IOException {
        StringBuilder alternation = new StringBuilder();
        InputStream stream = url.openStream();
        try {
            BomStrippingInputStreamReader reader = new BomStrippingInputStreamReader(stream, str);
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String expression = line.trim();
                if (expression.length() == 0 || expression.startsWith("//")) {
                    // Blank line or comment line - contributes nothing to the pattern.
                    continue;
                }
                if (alternation.length() > 0) {
                    alternation.append('|');
                }
                alternation.append("(?:").append(expression).append(')');
            }
        } finally {
            // Closing the underlying stream releases the connection on every path, including
            // a failure while constructing the reader or while reading.
            stream.close();
        }
        if (alternation.length() == 0) {
            // Empty negative lookahead: a valid expression that can never match.
            return Pattern.compile("(?!)");
        }
        return Pattern.compile(alternation.toString());
    }

    /**
     * Splits the content of {@link #document} into sentences.
     *
     * <p>All matches of {@link #nonSplitsPattern} are collected first as veto regions. The
     * matches of {@link #internalSplitsPattern} and {@link #externalSplitsPattern} are then
     * walked in order of their start offset. Every match that is not vetoed produces a
     * {@code "Split"} annotation (feature {@link ANNIEConstants#TOKEN_KIND_FEATURE_NAME} set to
     * {@code "internal"} or {@code "external"}) and, if there is text between the previous
     * sentence end and the split, an annotation of type
     * {@link ANNIEConstants#SENTENCE_ANNOTATION_TYPE}. Annotations are added to the set named by
     * {@link #outputASName}, or to the document's unnamed set when that name is null or blank.
     *
     * <p>A document whose content is empty or whitespace-only is left untouched; the
     * process-finished event is still fired so that listeners are not left waiting.
     *
     * @throws ExecutionException if no document has been set, if an annotation cannot be
     *         added because of an invalid offset, or if a match cannot be attributed to
     *         either split pattern
     */
    @Override
    public void execute() throws ExecutionException {
        this.interrupted = false;
        int lastProgress = 0;
        fireProgressChanged(lastProgress);
        if (this.document == null) {
            // Fail with the declared exception type instead of a bare NullPointerException.
            throw new ExecutionException("No document to process!");
        }
        AnnotationSet outputAS;
        if (this.outputASName == null || this.outputASName.trim().length() == 0) {
            outputAS = this.document.getAnnotations();
        } else {
            outputAS = this.document.getAnnotations(this.outputASName);
        }
        String docText = this.document.getContent().toString();
        if (docText.trim().length() < 1) {
            // Nothing to split; still signal completion to progress listeners.
            fireProcessFinished();
            return;
        }

        Matcher internalMatcher = this.internalSplitsPattern.matcher(docText);
        Matcher externalMatcher = this.externalSplitsPattern.matcher(docText);
        Matcher nonSplitMatcher = this.nonSplitsPattern.matcher(docText);

        // Veto regions as {start, end} pairs, in document order. veto() removes consumed
        // regions from the head, hence the linked list.
        List<int[]> nonSplits = new LinkedList<int[]>();
        while (nonSplitMatcher.find()) {
            nonSplits.add(new int[]{nonSplitMatcher.start(), nonSplitMatcher.end()});
        }

        // Holds at most one pending match per split pattern.
        List<MatchResult> pendingMatches = new ArrayList<MatchResult>(2);
        MatchResult pendingInternal = queueNextMatch(internalMatcher, pendingMatches);
        MatchResult pendingExternal = queueNextMatch(externalMatcher, pendingMatches);

        MatchResultComparator byStartOffset = new MatchResultComparator();
        int lastSentenceEnd = 0;
        while (!pendingMatches.isEmpty()) {
            // Take whichever pending match starts first.
            Collections.sort(pendingMatches, byStartOffset);
            MatchResult split = pendingMatches.remove(0);
            try {
                // Identity comparison is intentional: it tells which matcher produced the match.
                if (split == pendingInternal) {
                    if (!veto(split, nonSplits)) {
                        lastSentenceEnd = annotateInternalSplit(outputAS, docText, split, lastSentenceEnd);
                    }
                    pendingInternal = queueNextMatch(internalMatcher, pendingMatches);
                } else if (split == pendingExternal) {
                    if (!veto(split, nonSplits)) {
                        lastSentenceEnd = annotateExternalSplit(outputAS, docText, split, lastSentenceEnd);
                    }
                    pendingExternal = queueNextMatch(externalMatcher, pendingMatches);
                } else {
                    throw new ExecutionException("Invalid state - cannot identify match!");
                }
            } catch (InvalidOffsetException e) {
                throw new ExecutionException(e);
            }
            // long arithmetic: 100 * offset would overflow int for very large documents.
            int progress = (int) ((100L * lastSentenceEnd) / docText.length());
            if (progress - lastProgress > 20) {
                lastProgress = progress;
                fireProgressChanged(lastProgress);
            }
        }
        fireProcessFinished();
    }

    /**
     * Advances the matcher; if it finds another match, snapshots it, appends it to the
     * pending list and returns it, otherwise returns {@code null}.
     */
    private static MatchResult queueNextMatch(Matcher matcher, List<MatchResult> pendingMatches) {
        if (!matcher.find()) {
            return null;
        }
        MatchResult snapshot = matcher.toMatchResult();
        pendingMatches.add(snapshot);
        return snapshot;
    }

    /** Adds a "Split" annotation covering the match, tagged with the given kind. */
    private static void addSplit(AnnotationSet outputAS, MatchResult split, String kind)
            throws InvalidOffsetException {
        FeatureMap features = Factory.newFeatureMap();
        features.put(ANNIEConstants.TOKEN_KIND_FEATURE_NAME, kind);
        outputAS.add(Long.valueOf(split.start()), Long.valueOf(split.end()), "Split", features);
    }

    /**
     * Records an internal split and the sentence it terminates (the split text is part of the
     * sentence); returns the new sentence-end offset, which is the end of the split.
     */
    private static int annotateInternalSplit(AnnotationSet outputAS, String docText, MatchResult split,
            int lastSentenceEnd) throws InvalidOffsetException {
        addSplit(outputAS, split, "internal");
        int sentenceEnd = split.end();
        // Skip leading whitespace (Character.isWhitespace) after the previous sentence.
        int sentenceStart = lastSentenceEnd;
        while (sentenceStart < sentenceEnd
                && Character.isWhitespace(Character.codePointAt(docText, sentenceStart))) {
            sentenceStart++;
        }
        // Only emit a sentence when some text precedes the split itself.
        if (sentenceStart < split.start()) {
            outputAS.add(Long.valueOf(sentenceStart), Long.valueOf(sentenceEnd),
                    ANNIEConstants.SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
        }
        return sentenceEnd;
    }

    /**
     * Records an external split and the sentence that precedes it (the split text is not part
     * of the sentence); returns the new sentence-end offset, which is the end of the split.
     */
    private static int annotateExternalSplit(AnnotationSet outputAS, String docText, MatchResult split,
            int lastSentenceEnd) throws InvalidOffsetException {
        addSplit(outputAS, split, "external");
        // Trim space characters (Character.isSpaceChar, unlike the internal case) from both
        // ends of the candidate sentence.
        int sentenceEnd = split.start();
        while (sentenceEnd > lastSentenceEnd
                && Character.isSpaceChar(Character.codePointAt(docText, sentenceEnd - 1))) {
            sentenceEnd--;
        }
        int sentenceStart = lastSentenceEnd;
        while (sentenceStart < sentenceEnd
                && Character.isSpaceChar(Character.codePointAt(docText, sentenceStart))) {
            sentenceStart++;
        }
        if (sentenceStart < sentenceEnd) {
            outputAS.add(Long.valueOf(sentenceStart), Long.valueOf(sentenceEnd),
                    ANNIEConstants.SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
        }
        return split.end();
    }

    /**
     * Tells whether a candidate split is blocked by a veto region.
     *
     * <p>Regions are examined from the head of the list. A region whose last character lies
     * before the start of the split is removed from the list and the next one is examined. The
     * first region that is not removed decides the outcome: the split is vetoed exactly when
     * that region starts no later than the last character of the split.
     *
     * @param matchResult the candidate split
     * @param list        veto regions as {@code {start, end}} pairs; regions lying entirely
     *                    before the split are removed from it as a side effect
     * @return {@code true} if the split overlaps the first remaining region, {@code false} if
     *         it does not or if no region remains
     */
    private boolean veto(MatchResult matchResult, List<int[]> list) {
        for (Iterator<int[]> regions = list.iterator(); regions.hasNext();) {
            int[] region = regions.next();
            int regionLast = region[1] - 1;
            if (regionLast < matchResult.start()) {
                // Region already passed: drop it and look at the following one.
                regions.remove();
                continue;
            }
            int splitLast = matchResult.end() - 1;
            return region[0] <= splitLast;
        }
        return false;
    }

    /**
     * Initialises the splitter: checks that the three list locations and the encoding are set,
     * then compiles each list into its pattern.
     *
     * @return this resource
     * @throws ResourceInstantiationException if a required parameter is {@code null}, or if
     *         reading one of the lists fails
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        super.init();

        // Mandatory parameters, checked in a fixed order before anything is read.
        if (internalSplitListURL == null) {
            throw new ResourceInstantiationException("No list of internal splits provided!");
        }
        if (externalSplitListURL == null) {
            throw new ResourceInstantiationException("No list of external splits provided!");
        }
        if (nonSplitListURL == null) {
            throw new ResourceInstantiationException("No list of non splits provided!");
        }
        if (encoding == null) {
            throw new ResourceInstantiationException("No encoding provided!");
        }

        try {
            internalSplitsPattern = compilePattern(internalSplitListURL, encoding);
            externalSplitsPattern = compilePattern(externalSplitListURL, encoding);
            nonSplitsPattern = compilePattern(nonSplitListURL, encoding);
        } catch (IOException readFailure) {
            // UnsupportedEncodingException is an IOException, so this handler covers both.
            throw new ResourceInstantiationException(readFailure);
        }
        return this;
    }

    /**
     * Returns the document currently assigned to this splitter.
     *
     * @return the value of the {@code document} field, possibly {@code null}
     */
    @Override
    public Document getDocument() {
        return document;
    }

    /**
     * Assigns the document that {@link #execute()} will read and annotate.
     *
     * @param document the document to store in the {@code document} field
     */
    @Override
    public void setDocument(Document document) {
        // The parameter shadows the field, so the qualifier is required here.
        this.document = document;
    }

    /**
     * Returns the name of the annotation set that receives the generated annotations.
     *
     * @return the value of the {@code outputASName} field, possibly {@code null}
     */
    public String getOutputASName() {
        return outputASName;
    }

    /**
     * Sets the name of the annotation set that receives the generated annotations.
     *
     * @param str the name to store in the {@code outputASName} field
     */
    public void setOutputASName(String str) {
        outputASName = str;
    }

    /**
     * Returns the character encoding used to read the list resources.
     *
     * @return the value of the {@code encoding} field, possibly {@code null}
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Sets the character encoding used to read the list resources.
     *
     * @param str the encoding name to store in the {@code encoding} field
     */
    public void setEncoding(String str) {
        encoding = str;
    }

    /**
     * Returns the location of the internal split list.
     *
     * @return the value of the {@code internalSplitListURL} field, possibly {@code null}
     */
    public URL getInternalSplitListURL() {
        return internalSplitListURL;
    }

    /**
     * Sets the location of the internal split list.
     *
     * @param url the location to store in the {@code internalSplitListURL} field
     */
    public void setInternalSplitListURL(URL url) {
        internalSplitListURL = url;
    }

    /**
     * Returns the location of the external split list.
     *
     * @return the value of the {@code externalSplitListURL} field, possibly {@code null}
     */
    public URL getExternalSplitListURL() {
        return externalSplitListURL;
    }

    /**
     * Sets the location of the external split list.
     *
     * @param url the location to store in the {@code externalSplitListURL} field
     */
    public void setExternalSplitListURL(URL url) {
        externalSplitListURL = url;
    }

    /**
     * Returns the location of the non-split (veto) list.
     *
     * @return the value of the {@code nonSplitListURL} field, possibly {@code null}
     */
    public URL getNonSplitListURL() {
        return nonSplitListURL;
    }

    /**
     * Sets the location of the non-split (veto) list.
     *
     * @param url the location to store in the {@code nonSplitListURL} field
     */
    public void setNonSplitListURL(URL url) {
        nonSplitListURL = url;
    }

    /**
     * Returns the compiled pattern used to find internal splits.
     *
     * @return the value of the {@code internalSplitsPattern} field, possibly {@code null}
     */
    public Pattern getInternalSplitsPattern() {
        return internalSplitsPattern;
    }

    /**
     * Replaces the compiled pattern used to find internal splits.
     *
     * @param pattern the pattern to store in the {@code internalSplitsPattern} field
     */
    public void setInternalSplitsPattern(Pattern pattern) {
        internalSplitsPattern = pattern;
    }
}
