package gate.html;

import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.GateConstants;
import gate.corpora.DocumentContentImpl;
import gate.corpora.RepositioningInfo;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.InvalidOffsetException;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.NamespaceContext;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLResourceIdentifier;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLParseException;
import org.cyberneko.html.HTMLEventInfo;

/* loaded from: input_file:WEB-INF/lib/gate-core-6.1.jar:gate/html/NekoHtmlDocumentHandler.class */
public class NekoHtmlDocumentHandler implements XMLDocumentHandler, XMLErrorHandler {
    // Debug switches. None of them is referenced by the code visible in this class.
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_GENERAL = false;
    private static final boolean DEBUG_ELEMENTS = false;
    private static final boolean DEBUG_CHARACTERS = false;
    private static final boolean DEBUG_UNUSED = false;
    /**
     * Key passed to {@code Augmentations.getItem} in {@code characters} to look up the
     * {@code HTMLEventInfo} (begin line/column) of a character chunk.
     */
    public static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
    /**
     * Orders a mix of {@link Long} offsets and {@link RepositioningInfo.PositionInfo} entries by
     * position, so that a list of {@code PositionInfo} can be binary-searched with a plain
     * {@code Long} key. A {@code PositionInfo} is represented by its original position.
     *
     * <p>Either operand may be of either type. The previous implementation read the position of
     * the second operand from the first one when the second was a {@code PositionInfo}.
     */
    private static final Comparator<Object> POSITION_INFO_COMPARATOR = new Comparator<Object>() {
        @Override // java.util.Comparator
        public int compare(Object obj, Object obj2) {
            return Long.compare(positionOf(obj), positionOf(obj2));
        }

        /**
         * Extracts the offset that an operand stands for.
         *
         * @throws NullPointerException if the operand is null
         * @throws ClassCastException if the operand is neither a Long nor a PositionInfo
         */
        private long positionOf(Object operand) {
            if (operand == null) {
                throw new NullPointerException("Cannot compare a null position");
            }
            if (operand instanceof Long) {
                return ((Long) operand).longValue();
            }
            if (operand instanceof RepositioningInfo.PositionInfo) {
                return ((RepositioningInfo.PositionInfo) operand).getOriginalPosition();
            }
            throw new ClassCastException("Cannot compare positions of type " + operand.getClass().getName());
        }
    };
    /** Local names of tags whose character content is not copied into the document text. */
    private Set<String> ignorableTags;
    /** Number of start tags between two "Processed elements" status notifications. */
    static final int ELEMENTS_RATE = 128;
    /** Offset of the first character of each source line, indexed by (line number - 1). */
    private int[] lineOffsets;
    /** Text of the document as rebuilt so far; becomes the document content in endDocument. */
    private StringBuilder tmpDocContent;
    /** Offset in the original source at which the current run of character data started. */
    private int charactersStartOffset;
    /** Elements that have been opened but not yet closed, innermost on top. */
    private Stack<CustomObject> stack;
    /** Document whose content is replaced when parsing ends. */
    private Document doc;
    /** Annotation set receiving one annotation per element; resolved lazily in endDocument when null. */
    private AnnotationSet basicAS;
    /** Id handed to the next CustomObject that gets created. */
    protected int customObjectsId;
    /** Closed elements waiting to be turned into annotations in endDocument. */
    private LinkedList<CustomObject> colector;
    /** Whether a space is inserted between adjacent text chunks even when the source had none. */
    protected boolean addSpaceOnUnpack;
    /** When non-null, receives original-to-extracted offset mappings for each text chunk. */
    private RepositioningInfo reposInfo = null;
    /** Position entries consulted while building {@link #reposInfo}; must be set when reposInfo is. */
    private RepositioningInfo ampCodingInfo = null;
    /** Nesting depth of currently open ignorable tags; text is dropped while it is positive. */
    int ignorableTagLevels = 0;
    /** Whitespace-normalised text of the current run of characters() calls. */
    private StringBuilder contentBuffer = new StringBuilder("");
    /** True while character data is buffered and has not been flushed by charactersAction. */
    private boolean readCharacterStatus = false;
    /** Listeners notified by fireStatusChangedEvent. */
    protected List<StatusListener> myStatusListeners = new LinkedList<>();
    /** Number of start tags seen so far. */
    private int elements = 0;
    /** Whether the most recently flushed text chunk ended with whitespace in the source. */
    protected boolean previousChunkEndedWithWS = false;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:WEB-INF/lib/gate-core-6.1.jar:gate/html/NekoHtmlDocumentHandler$CustomObject.class */
    /**
     * Mutable description of one HTML element: its name, its attributes and the span of
     * extracted text it covers. Every instance receives the next value of the enclosing
     * handler's {@code customObjectsId} counter as its id, and instances are ordered by that id,
     * i.e. by creation order.
     */
    public class CustomObject implements Comparable<CustomObject> {
        private String elemName;
        private FeatureMap fm;
        private Long start;
        private Long end;
        private final Long id;

        /**
         * @param str element name
         * @param featureMap attributes of the element
         * @param l start offset
         * @param l2 end offset
         */
        public CustomObject(String str, FeatureMap featureMap, Long l, Long l2) {
            this.elemName = str;
            this.fm = featureMap;
            this.start = l;
            this.end = l2;
            // Take the current counter value and advance it for the next instance.
            this.id = Long.valueOf(NekoHtmlDocumentHandler.this.customObjectsId++);
        }

        /** Orders instances by id, which reflects the order in which they were created. */
        @Override // java.lang.Comparable
        public int compareTo(CustomObject customObject) {
            return this.id.compareTo(customObject.getId());
        }

        /** Readable summary, so that log messages mentioning an instance are meaningful. */
        @Override
        public String toString() {
            return "CustomObject{id=" + this.id + ", elemName=" + this.elemName
                    + ", start=" + this.start + ", end=" + this.end + "}";
        }

        public String getElemName() {
            return this.elemName;
        }

        public FeatureMap getFM() {
            return this.fm;
        }

        public Long getStart() {
            return this.start;
        }

        public Long getEnd() {
            return this.end;
        }

        public Long getId() {
            return this.id;
        }

        public void setElemName(String str) {
            this.elemName = str;
        }

        public void setFM(FeatureMap featureMap) {
            this.fm = featureMap;
        }

        public void setStart(Long l) {
            this.start = l;
        }

        public void setEnd(Long l) {
            this.end = l;
        }
    }

    /**
     * Creates a handler that rebuilds the text of {@code document} and collects its markup.
     *
     * @param document document being parsed; its current content size pre-sizes the text buffer
     * @param annotationSet set that receives the element annotations; may be null, in which case
     *        the document's original-markups set is looked up in {@code endDocument}
     * @param set local names of tags whose text content is skipped; null means "none"
     */
    public NekoHtmlDocumentHandler(Document document, AnnotationSet annotationSet, Set<String> set) {
        this.stack = new Stack<>();
        this.tmpDocContent = new StringBuilder(document.getContent().size().intValue());
        this.colector = new LinkedList<>();
        this.doc = document;
        this.basicAS = annotationSet;
        this.customObjectsId = 0;
        if (set == null) {
            this.ignorableTags = new HashSet<>();
        } else {
            this.ignorableTags = set;
        }
        // Spaces between text chunks are added unless the user configuration says otherwise.
        this.addSpaceOnUnpack = true;
        if (Gate.getUserConfig().get(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME) != null) {
            this.addSpaceOnUnpack = Gate.getUserConfig().getBoolean(GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME).booleanValue();
        }
    }

    /**
     * Supplies the table used by {@code characters} to turn a (line, column) pair into a
     * source offset: entry {@code n - 1} is read as the offset of the start of line {@code n}.
     *
     * @param iArr line start offsets; stored by reference, not copied
     */
    public void setLineOffsets(int[] iArr) {
        this.lineOffsets = iArr;
    }

    /**
     * Handles an opening tag. Pending text is flushed first, then the element is counted
     * (a status event is fired every {@code ELEMENTS_RATE} elements), the ignorable-tag depth
     * is updated, the attributes are copied into a feature map and a zero-length
     * {@code CustomObject} positioned at the current end of the text is pushed on the stack.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void startElement(QName qName, XMLAttributes xMLAttributes, Augmentations augmentations) throws XNIException {
        charactersAction();

        this.elements++;
        if (this.elements % ELEMENTS_RATE == 0) {
            fireStatusChangedEvent("Processed elements : " + this.elements);
        }

        final String tagName = qName.localpart;
        if (this.ignorableTags.contains(tagName)) {
            this.ignorableTagLevels++;
        }

        FeatureMap attributes = Factory.newFeatureMap();
        for (int index = 0; index < xMLAttributes.getLength(); index++) {
            attributes.put(xMLAttributes.getLocalName(index), xMLAttributes.getValue(index));
        }

        // May append a line break for some tags, so the offset is taken afterwards.
        customizeAppearanceOfDocumentWithStartTag(tagName);
        Long offset = Long.valueOf(this.tmpDocContent.length());
        this.stack.push(new CustomObject(tagName, attributes, offset, offset));
    }

    /**
     * Buffers a chunk of character data. On the first chunk of a run the buffer is reset and,
     * when repositioning info is being collected, the source offset of the chunk is computed
     * from its begin line and column. Each run of whitespace is stored as a single space,
     * including runs that continue from a previous chunk.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void characters(XMLString xMLString, Augmentations augmentations) throws XNIException {
        if (!this.readCharacterStatus) {
            if (this.reposInfo != null) {
                HTMLEventInfo eventInfo = null;
                if (augmentations != null) {
                    eventInfo = (HTMLEventInfo) augmentations.getItem(AUGMENTATIONS);
                }
                if (eventInfo != null) {
                    int lineStart = this.lineOffsets[eventInfo.getBeginLineNumber() - 1];
                    this.charactersStartOffset = lineStart + (eventInfo.getBeginColumnNumber() - 1);
                } else {
                    Err.println("Warning: could not determine proper repositioning info for character chunk \"" + new String(xMLString.ch, xMLString.offset, xMLString.length) + "\" near offset " + this.charactersStartOffset + ".  Save preserving format may give incorret results.");
                }
            }
            this.contentBuffer = new StringBuilder();
        }
        this.readCharacterStatus = true;

        // A space may be emitted only if the buffer does not already end with whitespace.
        final int buffered = this.contentBuffer.length();
        boolean spaceAllowed = buffered == 0 || !Character.isWhitespace(this.contentBuffer.charAt(buffered - 1));

        final int limit = xMLString.offset + xMLString.length;
        for (int pos = xMLString.offset; pos < limit; pos++) {
            final char current = xMLString.ch[pos];
            if (Character.isWhitespace(current)) {
                if (spaceAllowed) {
                    this.contentBuffer.append(' ');
                    spaceAllowed = false;
                }
            } else {
                this.contentBuffer.append(current);
                spaceAllowed = true;
            }
        }
    }

    /**
     * Flushes the buffered character data into the document text. Nothing happens when no
     * text is pending, the buffer is empty or an ignorable tag is open. Otherwise the buffer
     * is stripped of one leading and one trailing whitespace character, a separating space is
     * added when required, repositioning info is recorded if enabled, and the end offset of
     * every open element is moved to the new end of the text.
     */
    public void charactersAction() throws XNIException {
        if (!this.readCharacterStatus) {
            return;
        }
        this.readCharacterStatus = false;
        if (this.contentBuffer.length() == 0 || this.ignorableTagLevels > 0) {
            return;
        }

        final boolean startedWithWS = Character.isWhitespace(this.contentBuffer.charAt(0));
        if (startedWithWS) {
            this.contentBuffer.deleteCharAt(0);
        }
        if (this.contentBuffer.length() == 0) {
            // The chunk was whitespace only: remember that for the next chunk.
            this.previousChunkEndedWithWS = startedWithWS;
            return;
        }

        final int lastIndex = this.contentBuffer.length() - 1;
        final boolean endedWithWS = Character.isWhitespace(this.contentBuffer.charAt(lastIndex));
        if (endedWithWS) {
            this.contentBuffer.setLength(lastIndex);
        }

        final int outputStart = this.tmpDocContent.length();
        final boolean separatorWanted = this.previousChunkEndedWithWS || startedWithWS || this.addSpaceOnUnpack;
        final boolean separatorAdded = outputStart != 0
                && !Character.isWhitespace(this.tmpDocContent.charAt(outputStart - 1))
                && separatorWanted;
        if (separatorAdded) {
            this.tmpDocContent.append(' ');
        }
        this.tmpDocContent.append((CharSequence) this.contentBuffer);

        if (this.reposInfo != null) {
            long sourceStart = this.charactersStartOffset;
            if (startedWithWS) {
                // Skip the leading whitespace that was removed from the buffer.
                sourceStart = fixStartOffsetForWhitespace(sourceStart);
            }
            final int textStart = separatorAdded ? outputStart + 1 : outputStart;
            addRepositioningInfo(this.contentBuffer.length(), (int) sourceStart, textStart);
        }

        final Long newEnd = Long.valueOf(this.tmpDocContent.length());
        for (CustomObject open : this.stack) {
            // Elements that are still empty must not cover the separator space.
            if (separatorAdded && open.getStart().equals(open.getEnd())) {
                open.setStart(Long.valueOf(open.getStart().longValue() + 1));
            }
            open.setEnd(newEnd);
        }
        this.previousChunkEndedWithWS = endedWithWS;
    }

    /**
     * Handles a closing tag by delegating to the three-argument overload with {@code false},
     * i.e. the element was not reported as an empty element.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void endElement(QName qName, Augmentations augmentations) throws XNIException {
        endElement(qName, augmentations, false);
    }

    /**
     * Handles an empty element as a start tag immediately followed by an end tag. The end is
     * flagged as coming from an empty element, so no "isEmptyAndSpan" feature is added.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void emptyElement(QName qName, XMLAttributes xMLAttributes, Augmentations augmentations) throws XNIException {
        startElement(qName, xMLAttributes, augmentations);
        endElement(qName, augmentations, true);
    }

    /**
     * Closes the innermost open element. Pending text is flushed, the ignorable-tag depth is
     * updated and the element is moved from the stack to the collector. A zero-length element
     * that was not written as an empty element gets the feature "isEmptyAndSpan"; a
     * non-empty element may get a line break appended after it.
     *
     * @param qName name of the element being closed
     * @param augmentations not used
     * @param z true when called for an empty element (see {@code emptyElement})
     */
    public void endElement(QName qName, Augmentations augmentations, boolean z) throws XNIException {
        charactersAction();

        final String tagName = qName.localpart;
        if (this.ignorableTags.contains(tagName)) {
            this.ignorableTagLevels--;
        }
        if (this.stack.isEmpty()) {
            return;
        }

        final CustomObject closed = this.stack.pop();
        final boolean zeroLength = closed.getStart().equals(closed.getEnd());
        if (zeroLength && !z) {
            closed.getFM().put("isEmptyAndSpan", "true");
        }
        this.colector.add(closed);

        if (!zeroLength) {
            customizeAppearanceOfDocumentWithEndTag(tagName);
        }
    }

    /**
     * Finishes parsing: installs the rebuilt text as the document content and adds one
     * annotation per collected element, in creation order. When no annotation set was
     * supplied, the document's original-markups set is used. Elements whose offsets are
     * rejected are reported on the error stream and skipped.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void endDocument(Augmentations augmentations) throws XNIException {
        final String text = this.tmpDocContent.toString();
        this.doc.setContent(new DocumentContentImpl(text));

        if (this.basicAS == null) {
            this.basicAS = this.doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        }

        Collections.sort(this.colector);
        while (!this.colector.isEmpty()) {
            final CustomObject element = this.colector.removeFirst();
            try {
                this.basicAS.add(element.getStart(), element.getEnd(), element.getElemName(), element.getFM());
            } catch (InvalidOffsetException e) {
                Err.prln("Error creating an annot :" + element + " Discarded...");
            }
        }

        fireStatusChangedEvent("Total elements : " + this.elements);
    }

    /**
     * Reports a recoverable parse error by printing its stack trace to the writer returned by
     * {@code Err.getPrintWriter()}. The exception is not rethrown.
     */
    @Override // org.apache.xerces.xni.parser.XMLErrorHandler
    public void error(String str, String str2, XMLParseException xMLParseException) {
        xMLParseException.printStackTrace(Err.getPrintWriter());
    }

    /**
     * Handles a fatal parse error by rethrowing the exception it is given.
     *
     * @throws XNIException always; the supplied {@code xMLParseException} itself
     */
    @Override // org.apache.xerces.xni.parser.XMLErrorHandler
    public void fatalError(String str, String str2, XMLParseException xMLParseException) throws XNIException {
        throw xMLParseException;
    }

    /**
     * Flushes any buffered text when a processing instruction is met; the instruction itself
     * is not added to the document.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void processingInstruction(String str, XMLString xMLString, Augmentations augmentations) throws XNIException {
        charactersAction();
    }

    /**
     * Flushes any buffered text when a comment is met; the comment text is not added to the
     * document.
     */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void comment(XMLString xMLString, Augmentations augmentations) throws XNIException {
        charactersAction();
    }

    /** Flushes any buffered text at the start of a CDATA section. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void startCDATA(Augmentations augmentations) throws XNIException {
        charactersAction();
    }

    /** Flushes any buffered text at the end of a CDATA section. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void endCDATA(Augmentations augmentations) throws XNIException {
        charactersAction();
    }

    /**
     * Moves a source offset past the single leading whitespace character that was removed
     * from a text chunk. When an entry of {@code ampCodingInfo} starts exactly at the offset,
     * the whole original length of that entry is skipped; otherwise one character is skipped.
     *
     * @param j source offset of the removed whitespace
     * @return source offset of the first character that was kept
     */
    private long fixStartOffsetForWhitespace(long j) {
        final int index = Collections.binarySearch(this.ampCodingInfo, Long.valueOf(j), POSITION_INFO_COMPARATOR);
        if (index >= 0) {
            RepositioningInfo.PositionInfo entry = (RepositioningInfo.PositionInfo) this.ampCodingInfo.get(index);
            return j + entry.getOriginalLength();
        }
        return j + 1;
    }

    /**
     * Records in {@code reposInfo} how one flushed text chunk maps between the original
     * source and the extracted text. The chunk is split at every {@code ampCodingInfo} entry
     * that falls inside it: stretches between entries are added with equal lengths on both
     * sides, and each entry is added with its own original and current lengths.
     *
     * @param i length of the chunk in the extracted text
     * @param i2 offset of the chunk in the original source
     * @param i3 offset of the chunk in the extracted text
     */
    public void addRepositioningInfo(int i, int i2, int i3) {
        // Cursor advancing by "current" lengths, starting at the chunk's source offset.
        long cursor = i2;
        // Sum of (original length - current length) over the entries handled so far.
        long shrinkage = 0;

        for (int index = 0; index < this.ampCodingInfo.size(); index++) {
            RepositioningInfo.PositionInfo entry = (RepositioningInfo.PositionInfo) this.ampCodingInfo.get(index);
            final long entryStart = entry.getOriginalPosition();
            if (entryStart >= cursor) {
                if (entryStart > i2 + i + shrinkage) {
                    // The entry lies beyond the end of this chunk.
                    break;
                }
                final long gap = entryStart - (cursor + shrinkage);
                final long consumed = cursor - i2;
                if (gap > 0) {
                    // Plain text between the cursor and the entry.
                    this.reposInfo.addPositionInfo(cursor + shrinkage, gap, i3 + consumed, gap);
                }
                this.reposInfo.addPositionInfo(entryStart, entry.getOriginalLength(), i3 + consumed + gap, entry.getCurrentLength());
                cursor = cursor + gap + entry.getCurrentLength();
                shrinkage += entry.getOriginalLength() - entry.getCurrentLength();
            }
        }

        // Whatever is left of the chunk after the last entry.
        final long consumedTotal = cursor - i2;
        final long remaining = i - consumedTotal;
        if (remaining > 0) {
            this.reposInfo.addPositionInfo(cursor + shrinkage, remaining, i3 + consumedTotal, remaining);
        }
    }

    /**
     * Appends a line break to the text before certain opening tags and, if one was appended,
     * extends every open element to the new end of the text.
     * <ul>
     * <li>"p": when the text has at least two characters and the one before last is not a
     * newline;</li>
     * <li>"br": always;</li>
     * <li>"div": when the text is non-empty and does not already end with a newline.</li>
     * </ul>
     *
     * @param str local name of the tag being opened
     */
    protected void customizeAppearanceOfDocumentWithStartTag(String str) {
        final int lengthBefore = this.tmpDocContent.length();

        final boolean breakNeeded;
        if ("p".equals(str)) {
            breakNeeded = lengthBefore >= 2 && this.tmpDocContent.charAt(lengthBefore - 2) != '\n';
        } else if ("br".equals(str)) {
            breakNeeded = true;
        } else if ("div".equals(str)) {
            breakNeeded = lengthBefore > 0 && this.tmpDocContent.charAt(lengthBefore - 1) != '\n';
        } else {
            breakNeeded = false;
        }
        if (!breakNeeded) {
            return;
        }

        this.tmpDocContent.append("\n");
        final Long newEnd = Long.valueOf(this.tmpDocContent.length());
        for (CustomObject open : this.stack) {
            open.setEnd(newEnd);
        }
    }

    /**
     * Appends line breaks to the text after certain closing tags and, if anything was
     * appended, extends every open element to the new end of the text.
     * <ul>
     * <li>"p", "h1" to "h6", "tr", "center", "li": one newline;</li>
     * <li>"div": one newline, unless the text is empty or already ends with a newline;</li>
     * <li>"title": two newlines.</li>
     * </ul>
     *
     * @param str local name of the tag being closed
     */
    protected void customizeAppearanceOfDocumentWithEndTag(String str) {
        final String suffix;
        if ("title".equals(str)) {
            suffix = "\n\n";
        } else if ("div".equals(str)) {
            final int length = this.tmpDocContent.length();
            final boolean alreadyBroken = length == 0 || this.tmpDocContent.charAt(length - 1) == '\n';
            suffix = alreadyBroken ? null : "\n";
        } else if ("p".equals(str) || "h1".equals(str) || "h2".equals(str) || "h3".equals(str)
                || "h4".equals(str) || "h5".equals(str) || "h6".equals(str) || "tr".equals(str)
                || "center".equals(str) || "li".equals(str)) {
            suffix = "\n";
        } else {
            suffix = null;
        }
        if (suffix == null) {
            return;
        }

        this.tmpDocContent.append(suffix);
        final Long newEnd = Long.valueOf(this.tmpDocContent.length());
        for (CustomObject open : this.stack) {
            open.setEnd(newEnd);
        }
    }

    /**
     * Sets the object that receives offset mappings for each flushed text chunk. While it is
     * null (the initial value) no repositioning information is computed.
     */
    public void setRepositioningInfo(RepositioningInfo repositioningInfo) {
        this.reposInfo = repositioningInfo;
    }

    /** Returns the repositioning info set via {@code setRepositioningInfo}, or null if none was set. */
    public RepositioningInfo getRepositioningInfo() {
        return this.reposInfo;
    }

    /**
     * Sets the position entries consulted when repositioning information is built. The code
     * dereferences this object without a null check whenever repositioning info is enabled,
     * so it has to be set in that case.
     */
    public void setAmpCodingInfo(RepositioningInfo repositioningInfo) {
        this.ampCodingInfo = repositioningInfo;
    }

    /** Returns the object set via {@code setAmpCodingInfo}, or null if none was set. */
    public RepositioningInfo getAmpCodingInfo() {
        return this.ampCodingInfo;
    }

    /**
     * Replaces the set of tags whose text content is skipped.
     *
     * @param set local tag names; null is treated as an empty set, as in the constructor, so
     *        that the element callbacks can query the set without a null check
     */
    public void setIgnorableTags(Set<String> set) {
        if (set == null) {
            this.ignorableTags = new HashSet<>();
        } else {
            this.ignorableTags = set;
        }
    }

    /** Returns the set of ignorable tag names itself, not a copy. */
    public Set<String> getIgnorableTags() {
        return this.ignorableTags;
    }

    /** Returns the id that the next created {@code CustomObject} will receive. */
    public int getCustomObjectsId() {
        return this.customObjectsId;
    }

    /** Registers a listener for the status messages fired by this handler. */
    public void addStatusListener(StatusListener statusListener) {
        this.myStatusListeners.add(statusListener);
    }

    /** Unregisters a previously added status listener; does nothing if it is not registered. */
    public void removeStatusListener(StatusListener statusListener) {
        this.myStatusListeners.remove(statusListener);
    }

    /**
     * Sends a status message to every registered listener, in registration order.
     *
     * @param str message to deliver
     */
    protected void fireStatusChangedEvent(String str) {
        for (StatusListener listener : this.myStatusListeners) {
            listener.statusChanged(str);
        }
    }

    /** Does nothing: the document type declaration is ignored by this handler. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void doctypeDecl(String str, String str2, String str3, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: general entity boundaries are ignored by this handler. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void endGeneralEntity(String str, Augmentations augmentations) throws XNIException {
    }

    /** Always returns null: the document source passed to {@code setDocumentSource} is not kept. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public XMLDocumentSource getDocumentSource() {
        return null;
    }

    /** Does nothing: ignorable whitespace is not added to the document text. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void ignorableWhitespace(XMLString xMLString, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: the supplied document source is discarded. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void setDocumentSource(XMLDocumentSource xMLDocumentSource) {
    }

    /** Does nothing: all state is initialised in the constructor. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void startDocument(XMLLocator xMLLocator, String str, NamespaceContext namespaceContext, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: general entity boundaries are ignored by this handler. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void startGeneralEntity(String str, XMLResourceIdentifier xMLResourceIdentifier, String str2, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: text declarations are ignored by this handler. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void textDecl(String str, String str2, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: the XML declaration is ignored by this handler. */
    @Override // org.apache.xerces.xni.XMLDocumentHandler
    public void xmlDecl(String str, String str2, String str3, Augmentations augmentations) throws XNIException {
    }

    /** Does nothing: parser warnings are silently discarded. */
    @Override // org.apache.xerces.xni.parser.XMLErrorHandler
    public void warning(String str, String str2, XMLParseException xMLParseException) throws XNIException {
    }
}
