package org.ow2.weblab.service.normaliser.tika.handlers;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.ow2.weblab.core.extended.factory.MediaUnitFactory;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.service.normaliser.tika.TikaConfiguration;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/* loaded from: input_file:WEB-INF/classes/org/ow2/weblab/service/normaliser/tika/handlers/SimpleTextContentHandler.class */
public class SimpleTextContentHandler extends WebLabHandlerDecorator {
    private static final List<String> NEWLINE_ELEMENTS = Arrays.asList("ol", "dt", "dl", "ul", "li", "br", "p", "div", "table", "tr");
    private static final List<String> TAB_ELEMENTS = Arrays.asList("td");
    private Document document;
    private Text createdText;
    private boolean isInBody = false;
    private final StringBuilder sb = new StringBuilder();
    private final Log logger = LogFactory.getLog(getClass());

    public SimpleTextContentHandler() {
        this.logger.debug("SimpleTextContentHandler initialised.");
    }

    @Override // org.apache.tika.sax.ContentHandlerDecorator, org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
        super.startElement(str, str2, str3, attributes);
        this.logger.trace("Start element: " + str3);
        if ("body".equalsIgnoreCase(str3)) {
            this.isInBody = true;
        } else if (this.isInBody && NEWLINE_ELEMENTS.contains(str3)) {
            this.sb.append("\n");
        }
    }

    @Override // org.apache.tika.sax.ContentHandlerDecorator, org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void endElement(String str, String str2, String str3) throws SAXException {
        super.endElement(str, str2, str3);
        this.logger.trace("End element: " + str3);
        if ("body".equalsIgnoreCase(str3)) {
            this.createdText.setContent(this.sb.toString().trim());
            this.isInBody = false;
            if (this.createdText.getContent().trim().isEmpty()) {
                this.logger.warn("The MediaUnit " + this.createdText.getUri() + " will be removed since it is empty.");
                this.document.getMediaUnit().remove(this.createdText);
                return;
            }
            return;
        }
        if (this.isInBody && NEWLINE_ELEMENTS.contains(str3)) {
            this.sb.append("\n");
            return;
        }
        if (this.isInBody && TAB_ELEMENTS.contains(str3)) {
            this.sb.append("\t");
        } else if (this.isInBody) {
            this.sb.append(" ");
        }
    }

    @Override // org.apache.tika.sax.ContentHandlerDecorator, org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void characters(char[] cArr, int i, int i2) throws SAXException {
        super.characters(cArr, i, i2);
        if (this.isInBody) {
            String trim = new String(Arrays.copyOfRange(cArr, i, i + i2)).replaceAll("\\s+", " ").trim();
            if (trim.isEmpty()) {
                return;
            }
            this.sb.append(trim);
        }
    }

    @Override // org.ow2.weblab.service.normaliser.tika.handlers.WebLabHandlerDecorator
    public void setDocument(Document document) {
        this.document = document;
        this.createdText = (Text) MediaUnitFactory.createAndLinkMediaUnit(this.document, Text.class);
    }

    @Override // org.ow2.weblab.service.normaliser.tika.handlers.WebLabHandlerDecorator
    public void setTikaConfiguration(TikaConfiguration tikaConfiguration) {
    }
}
