package org.ow2.weblab.service.normaliser.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import javax.jws.WebService;
import javax.xml.bind.DatatypeConverter;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.ow2.weblab.content.api.ContentManager;
import org.ow2.weblab.content.impl.FileContentManager;
import org.ow2.weblab.core.extended.exception.WebLabCheckedException;
import org.ow2.weblab.core.extended.factory.AnnotationFactory;
import org.ow2.weblab.core.extended.ontologies.DCTerms;
import org.ow2.weblab.core.extended.ontologies.WebLabProcessing;
import org.ow2.weblab.core.extended.util.ResourceUtil;
import org.ow2.weblab.core.helper.impl.JenaPoKHelper;
import org.ow2.weblab.core.model.Annotation;
import org.ow2.weblab.core.model.Document;
import org.ow2.weblab.core.model.Resource;
import org.ow2.weblab.core.model.Text;
import org.ow2.weblab.core.services.Analyser;
import org.ow2.weblab.core.services.ContentNotAvailableException;
import org.ow2.weblab.core.services.InvalidParameterException;
import org.ow2.weblab.core.services.UnexpectedException;
import org.ow2.weblab.core.services.analyser.ProcessArgs;
import org.ow2.weblab.core.services.analyser.ProcessReturn;
import org.ow2.weblab.rdf.Value;
import org.ow2.weblab.service.normaliser.tika.handlers.WebLabHandlerDecorator;
import org.ow2.weblab.service.normaliser.tika.metadatawriter.MetadataWriter;
import org.purl.dc.elements.DublinCoreAnnotator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * WebLab {@link Analyser} that uses Apache Tika to parse the native content of a {@link Document}.
 *
 * <p>For each processed document the service reads the native content through the {@link ContentManager},
 * parses it with a Tika {@link Parser} feeding a {@link WebLabHandlerDecorator}, optionally writes an XHTML
 * rendition back as normalised content, and optionally writes the extracted metadata in a new annotation
 * through the configured {@link MetadataWriter}.</p>
 */
@WebService(endpointInterface = "org.ow2.weblab.core.services.Analyser")
public class TikaExtractorService implements Analyser {
    protected final Log logger = LogFactory.getLog(getClass());
    protected final ContentManager contentManager = ContentManager.getInstance();
    protected final TikaConfiguration serviceConfig;
    protected final TikaConfig tikaConfig;
    /** True when the native content file handed out by the content manager has to be deleted after processing. */
    protected final boolean removeContent;
    /**
     * Date format with pattern {@code yyyy-MM-dd}. Not used by this class itself.
     * NOTE(review): {@link SimpleDateFormat} is not thread-safe; any code sharing this instance across
     * concurrent requests must synchronise on it.
     */
    protected final DateFormat simpleDateFormat;
    protected MetadataWriter metadataWriter;

    /**
     * Builds the service from its configuration.
     *
     * @param tikaConfiguration the service configuration (Tika XML configuration path, metadata writer class, flags)
     * @throws TikaException if the Tika configuration cannot be built
     * @throws IOException if the content manager is unavailable, the Tika XML configuration file cannot be
     *         found or parsed, or the metadata writer cannot be instantiated
     */
    public TikaExtractorService(TikaConfiguration tikaConfiguration) throws TikaException, IOException {
        this.serviceConfig = tikaConfiguration;

        // Must be checked before the first dereference of the content manager below.
        if (this.contentManager == null) {
            final String message = Messages.getString(Constants.KEY_ERROR_UNABLE_TO_LOAD_CONTENT_MANAGER);
            this.logger.fatal(message);
            throw new IOException(message);
        }

        this.removeContent = !(this.contentManager.getReader() instanceof FileContentManager) && this.serviceConfig.isRemoveTempContent();
        this.simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");

        final String configPath = this.serviceConfig.getPathToXmlConfigurationFile();
        if (configPath == null) {
            this.logger.debug(Messages.getString(Constants.KEY_DEBUG_DEFAULT_TIKA_CONFIG));
            this.tikaConfig = new TikaConfig();
        } else {
            // getResource returns null when the file is not on the classpath; fail with an explicit message.
            final java.net.URL configUrl = getClass().getClassLoader().getResource(configPath);
            if (configUrl == null) {
                throw new IOException("Tika configuration file not found on the classpath: " + configPath);
            }
            try {
                this.tikaConfig = new TikaConfig(configUrl);
            } catch (SAXException e) {
                throw new IOException(e);
            }
        }

        if (!(this.tikaConfig.getParser() instanceof CompositeParser)) {
            this.logger.warn(Messages.getString(Constants.KEY_WARN_NOT_A_COMPOSITE_PARSER_1, this.tikaConfig.getParser().getClass().getCanonicalName()));
        }

        try {
            this.metadataWriter = tikaConfiguration.getMetadataWriterClass().newInstance();
            this.logger.info(Messages.getString(Constants.KEY_INFO_SERVICE_STARTED));
        } catch (IllegalAccessException e) {
            throw metadataWriterFailure(e);
        } catch (InstantiationException e) {
            throw metadataWriterFailure(e);
        }
    }

    /** Logs the metadata writer instantiation failure and returns the exception to throw, keeping the cause. */
    private IOException metadataWriterFailure(Exception cause) {
        final String message = "Failed to initialize the metadataWriter field";
        this.logger.fatal(message, cause);
        return new IOException(message, cause);
    }

    /**
     * Extracts text (and optionally metadata and an XHTML rendition) from the document in {@code processArgs}.
     *
     * <p>A first extraction uses the format declared on the document when there is one. If it produced no
     * {@link Text} unit, a second extraction is run ignoring the declared format.</p>
     *
     * @param processArgs wrapper holding the {@link Document} to process
     * @return a {@link ProcessReturn} holding the same document instance
     * @throws InvalidParameterException if the arguments are null or the resource is not a {@link Document}
     * @throws ContentNotAvailableException if the native content cannot be read
     * @throws UnexpectedException if parsing fails or the handler cannot be instantiated
     */
    @Override // org.ow2.weblab.core.services.Analyser
    public ProcessReturn process(ProcessArgs processArgs) throws InvalidParameterException, ContentNotAvailableException, UnexpectedException {
        this.logger.trace("Process method called.");
        final Document document = checkArgs(processArgs);
        this.logger.info(Messages.getString(Constants.KEY_INFO_PROCESS_DOCUMENT_1, document.getUri()));

        final File content = getContent(document);
        try {
            Metadata metadata = extractTextAndMetadata(document, content, false);
            if (ResourceUtil.getSelectedSubResources(document, Text.class).isEmpty()) {
                this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_TEXT_FOUND_2, content.getAbsolutePath(), document.getUri()));
                metadata = extractTextAndMetadata(document, content, true);
            }
            if (this.serviceConfig.isAddMetadata()) {
                annotateWithMetadata(document, metadata);
            }
        } finally {
            // Remove the temporary content even when extraction failed, otherwise the file is leaked.
            if (this.removeContent && !content.delete()) {
                this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_DELETE_TEMP_2, content.getAbsolutePath(), document.getUri()));
            }
        }

        final ProcessReturn processReturn = new ProcessReturn();
        processReturn.setResource(document);
        this.logger.info(Messages.getString(Constants.KEY_INFO_END_OF_PROCESS_1, document.getUri()));
        return processReturn;
    }

    /**
     * Creates an annotation on the document, lets the metadata writer fill it, and adds provenance
     * statements (producing service, creation date) when a service URI is configured.
     *
     * <p>NOTE(review): the decompiled original contained constant-false branches that would have registered
     * namespace prefixes in other cases; they were unreachable and have been removed.</p>
     */
    private void annotateWithMetadata(Document document, Metadata metadata) {
        final Annotation annotation = AnnotationFactory.createAndLinkAnnotation(document);
        final JenaPoKHelper helper = new JenaPoKHelper(annotation);
        helper.setAutoCommitMode(false);
        try {
            this.metadataWriter.write(metadata, helper, new URI(document.getUri()));
            if (this.serviceConfig.getServiceUri() != null) {
                helper.createResStat(annotation.getUri(), "http://weblab.ow2.org/core/1.2/ontology/processing#isProducedBy", this.serviceConfig.getServiceUri());
                helper.createLitStat(annotation.getUri(), DCTerms.CREATED, DatatypeConverter.printDateTime(Calendar.getInstance()));
                helper.setNSPrefix(DCTerms.PREFERRED_PREFIX, "http://purl.org/dc/terms/");
                helper.setNSPrefix(WebLabProcessing.PREFERRED_PREFIX, "http://weblab.ow2.org/core/1.2/ontology/processing#");
            }
            helper.commit();
        } catch (URISyntaxException e) {
            // Best effort: the metadata annotation is skipped but text extraction results are kept.
            this.logger.error("Document URI is not a valid : " + e.getLocalizedMessage(), e);
        }
    }

    /**
     * Validates the process arguments and returns the document they carry.
     *
     * @param processArgs the arguments received by {@link #process(ProcessArgs)}
     * @return the resource of {@code processArgs}, cast to {@link Document}
     * @throws InvalidParameterException if {@code processArgs} or its resource is null, or if the resource
     *         is not a {@link Document}
     */
    protected Document checkArgs(ProcessArgs processArgs) throws InvalidParameterException {
        if (processArgs == null) {
            throw invalidParameter(Messages.getString(Constants.KEY_ERROR_PROCESSARGS_NULL));
        }
        final Resource resource = processArgs.getResource();
        if (resource == null) {
            throw invalidParameter(Messages.getString(Constants.KEY_ERROR_RESOURCE_NULL));
        }
        if (!(resource instanceof Document)) {
            throw invalidParameter(Messages.getString(Constants.KEY_ERROR_NOT_A_DOCUMENT_2, resource.getUri(), resource.getClass().getCanonicalName()));
        }
        return (Document) resource;
    }

    /** Logs {@code message} as an error and returns the matching {@link InvalidParameterException}. */
    private InvalidParameterException invalidParameter(String message) {
        this.logger.error(message);
        return new InvalidParameterException(message, Messages.getString(Constants.KEY_ERROR_INVALID_PARAM));
    }

    /**
     * Retrieves the native content file of the document and checks that it can be read.
     *
     * @throws ContentNotAvailableException if the content manager fails or returns no file, or if the file
     *         does not exist or is not readable
     */
    private File getContent(Document document) throws ContentNotAvailableException {
        final File content;
        try {
            content = this.contentManager.readNativeContent(document);
        } catch (WebLabCheckedException e) {
            final String message = Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_1, document.getUri());
            this.logger.error(message, e);
            throw new ContentNotAvailableException(message, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE), e);
        }
        if (content == null) {
            // Guard against a content manager returning no file instead of throwing.
            throw contentNotAvailable(Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_1, document.getUri()));
        }
        if (!content.exists()) {
            throw contentNotAvailable(Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_FOUND_2, content.getAbsolutePath(), document.getUri()));
        }
        if (!content.canRead()) {
            throw contentNotAvailable(Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_READABLE_2, content.getAbsolutePath(), document.getUri()));
        }
        return content;
    }

    /** Logs {@code message} as an error and returns the matching {@link ContentNotAvailableException}. */
    private ContentNotAvailableException contentNotAvailable(String message) {
        this.logger.error(message);
        return new ContentNotAvailableException(message, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE));
    }

    /**
     * Parses {@code file} with Tika, feeding the WebLab handler (and, depending on the configuration, a
     * language profiler and an XHTML serialiser), then saves the XHTML output as normalised content.
     *
     * @param document the document being processed; handed to the WebLab handler
     * @param file the native content to parse
     * @param z when {@code true}, the format declared on the document is ignored and Tika auto-detection is used
     * @return the metadata gathered by Tika, or an empty {@link Metadata} when metadata annotation is disabled
     * @throws UnexpectedException if parsing fails or the WebLab handler cannot be instantiated
     * @throws ContentNotAvailableException if {@code file} cannot be opened
     */
    protected Metadata extractTextAndMetadata(Document document, File file, boolean z) throws UnexpectedException, ContentNotAvailableException {
        final Parser parser = selectParser(document, z);
        final boolean detectLanguage = this.serviceConfig.isAddMetadata() && this.serviceConfig.isAnnotateDocumentWithLang();

        // The temporary XHTML file is only needed when HTML generation is requested.
        File htmlFile = null;
        if (this.serviceConfig.isGenerateHtml()) {
            try {
                htmlFile = File.createTempFile(TikaConfiguration.UNMAPPED_PROPERTIES_PREFIX, ".xhtml");
            } catch (IOException e) {
                // HTML generation is disabled for this document; extraction itself continues.
                this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_CREATE_TEMP_FILE_1, document.getUri()), e);
            }
        }

        try {
            ContentHandler htmlHandler = null;
            if (htmlFile != null) {
                try {
                    htmlHandler = getHtmlCreatorCHandler(htmlFile);
                } catch (TransformerConfigurationException e) {
                    this.logger.warn(Messages.getString(Constants.KEY_WARN_UNABLE_TO_CREATE_TRANSFORMER_1, document.getUri()), e);
                }
            }

            final ProfilingHandler profilingHandler = new ProfilingHandler();
            final ContentHandler handler = buildContentHandler(document, detectLanguage ? profilingHandler : null, htmlHandler);

            final Metadata metadata = new Metadata();
            parseContent(parser, file, handler, metadata, document);

            if (detectLanguage && profilingHandler.getLanguage().isReasonablyCertain()) {
                metadata.set("language", profilingHandler.getLanguage().getLanguage());
            } else if (this.serviceConfig.isAnnotateDocumentWithLang() && this.serviceConfig.getDefaultLang() != null) {
                metadata.set("language", this.serviceConfig.getDefaultLang());
            }

            if (htmlHandler != null) {
                saveNormalisedContent(htmlFile, document);
            }
            return this.serviceConfig.isAddMetadata() ? metadata : new Metadata();
        } finally {
            // Always remove the temporary XHTML file, including on failure (deleteQuietly accepts null).
            FileUtils.deleteQuietly(htmlFile);
        }
    }

    /**
     * Chooses the parser: the one registered for the format declared on the document when available,
     * otherwise an {@link AutoDetectParser}; when the Tika configuration parser is not a
     * {@link CompositeParser}, that parser is used for a declared format.
     */
    private Parser selectParser(Document document, boolean ignoreDeclaredFormat) {
        String mimeType = null;
        if (!ignoreDeclaredFormat) {
            final Value<String> format = new DublinCoreAnnotator(document).readFormat();
            if (format != null && format.hasValue()) {
                mimeType = format.firstTypedValue();
                if (format.getValues().size() > 1) {
                    this.logger.warn(Messages.getString(Constants.KEY_WARN_MORE_THAN_ONE_TYPE_2, document.getUri(), mimeType));
                }
            }
            this.logger.debug("Mime type detected in Resource: " + mimeType);
        }

        if (mimeType == null) {
            return new AutoDetectParser(this.tikaConfig);
        }
        final Parser configured = this.tikaConfig.getParser();
        if (!(configured instanceof CompositeParser)) {
            this.logger.debug("Tika Config does not use an AutodetectParser but a " + configured.getClass().getCanonicalName() + ".");
            return configured;
        }
        final MediaType mediaType = MediaType.parse(mimeType);
        final Parser dedicated = ((CompositeParser) configured).getParsers().get(mediaType);
        if (dedicated != null) {
            return dedicated;
        }
        this.logger.debug("No parser for type " + mediaType + " let Tika guess type.");
        return new AutoDetectParser(this.tikaConfig);
    }

    /**
     * Combines the WebLab media unit handler with the optional language profiler and XHTML serialiser.
     *
     * @param profilingHandler the language profiler, or null when language detection is disabled
     * @param htmlHandler the XHTML serialiser, or null when no XHTML output is produced
     */
    private ContentHandler buildContentHandler(Document document, ProfilingHandler profilingHandler, ContentHandler htmlHandler) throws UnexpectedException {
        final ContentHandler mediaUnitHandler = getMUCreatorCHandler(document);
        if (profilingHandler != null && htmlHandler != null) {
            this.logger.trace("Create a TeeContentHandler for language guesser, MediaUnit creation and XHTML output creation.");
            return new TeeContentHandler(mediaUnitHandler, profilingHandler, htmlHandler);
        }
        if (htmlHandler != null) {
            this.logger.trace("Create a TeeContentHandler for MediaUnit creation and XHTML output creation.");
            return new TeeContentHandler(mediaUnitHandler, htmlHandler);
        }
        if (profilingHandler != null) {
            this.logger.trace("Create a TeeContentHandler for language guesser and MediaUnit creation.");
            return new TeeContentHandler(mediaUnitHandler, profilingHandler);
        }
        this.logger.trace("Create a ContentHandler for MediaUnit creation.");
        return mediaUnitHandler;
    }

    /** Opens {@code file}, runs the parser on it and always closes the stream, translating failures into service exceptions. */
    private void parseContent(Parser parser, File file, ContentHandler handler, Metadata metadata, Document document) throws UnexpectedException, ContentNotAvailableException {
        final InputStream input;
        try {
            input = new FileInputStream(file);
        } catch (FileNotFoundException e) {
            final String message = Messages.getString(Constants.KEY_ERROR_CONTENT_FILE_NOT_FOUND_2, file.getAbsolutePath(), document.getUri());
            this.logger.error(message, e);
            throw new ContentNotAvailableException(message, Messages.getString(Constants.KEY_ERROR_CONTENT_NOT_AVAILABLE_SIMPLE), e);
        }
        try {
            this.logger.debug("Start parsing " + file.getPath() + " for document " + document.getUri() + ".");
            parser.parse(input, handler, metadata, new ParseContext());
        } catch (IOException e) {
            final String message = Messages.getString(Constants.KEY_ERROR_IOE_ON_CONTENT_2, file.getPath(), document.getUri());
            this.logger.error(message, e);
            throw new UnexpectedException(message, Messages.getString(Constants.KEY_ERROR_IOE_ON_CONTENT_SIMPLE), e);
        } catch (TikaException e) {
            final String message = Messages.getString(Constants.KEY_ERROR_TIKA_EX_ON_CONTENT_2, file.getPath(), document.getUri());
            this.logger.error(message, e);
            throw new UnexpectedException(message, Messages.getString(Constants.KEY_ERROR_ERROR_ON_CONTENT_SIMPLE), e);
        } catch (SAXException e) {
            final String message = Messages.getString(Constants.KEY_ERROR_SAXE_ON_CONTENT_2, file.getPath(), document.getUri());
            this.logger.error(message, e);
            throw new UnexpectedException(message, Messages.getString(Constants.KEY_ERROR_ERROR_ON_CONTENT_SIMPLE), e);
        } finally {
            IOUtils.closeQuietly(input);
        }
        this.logger.debug("Finished parsing " + file.getPath() + " for document " + document.getUri() + ".");
    }

    /** Stores the generated XHTML file as normalised content of the document; every failure is only logged as a warning. */
    private void saveNormalisedContent(File htmlFile, Document document) {
        if (!htmlFile.exists()) {
            this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_OUTPUT_FILE_2, htmlFile.getPath(), document.getUri()));
            return;
        }
        if (FileUtils.sizeOf(htmlFile) <= 0) {
            this.logger.warn(Messages.getString(Constants.KEY_WARN_EMPTY_OUTPUT_FILE_2, htmlFile.getPath(), document.getUri()));
            return;
        }
        final InputStream input;
        try {
            input = new FileInputStream(htmlFile);
        } catch (FileNotFoundException e) {
            this.logger.warn(Messages.getString(Constants.KEY_WARN_NO_OUTPUT_FILE_2, htmlFile.getPath(), document.getUri()), e);
            return;
        }
        try {
            this.logger.debug("Save normalised content file: " + htmlFile);
            this.contentManager.writeNormalisedContent(input, document);
        } catch (WebLabCheckedException e) {
            this.logger.warn(Messages.getString(Constants.KEY_WARN_ERROR_SAVING_NORMALISED_2, htmlFile.getPath(), document.getUri()), e);
        } finally {
            IOUtils.closeQuietly(input);
        }
    }

    /**
     * Instantiates the configured {@link WebLabHandlerDecorator} and wires it to the document, the service
     * configuration and a {@link BodyContentHandler} created with a write limit of {@code -1}.
     *
     * @throws UnexpectedException if the configured handler class cannot be instantiated
     */
    private WebLabHandlerDecorator getMUCreatorCHandler(Document document) throws UnexpectedException {
        final WebLabHandlerDecorator decorator;
        try {
            decorator = this.serviceConfig.getWebLabHandlerDecoratorClass().newInstance();
        } catch (IllegalAccessException e) {
            throw badHandler(e);
        } catch (InstantiationException e) {
            throw badHandler(e);
        }
        decorator.setDocument(document);
        decorator.setTikaConfiguration(this.serviceConfig);
        decorator.setContentHandler(new BodyContentHandler(-1));
        return decorator;
    }

    /** Logs the handler instantiation failure and returns the matching {@link UnexpectedException}. */
    private UnexpectedException badHandler(Exception cause) {
        final String message = Messages.getString(Constants.KEY_ERROR_BAD_HANDLER_1, this.serviceConfig.getWebLabHandlerDecoratorClass().getCanonicalName());
        this.logger.error(message, cause);
        return new UnexpectedException(message, message, cause);
    }

    /**
     * Creates a SAX handler that serialises the events it receives as indented XML into {@code file}.
     *
     * @throws TransformerConfigurationException if the platform transformer factory does not support SAX
     *         handlers or cannot create one
     */
    private ContentHandler getHtmlCreatorCHandler(File file) throws TransformerConfigurationException {
        final TransformerFactory factory = TransformerFactory.newInstance();
        if (!(factory instanceof SAXTransformerFactory)) {
            // Report as a configuration problem so the caller can fall back to extraction without XHTML output.
            throw new TransformerConfigurationException("TransformerFactory " + factory.getClass().getName() + " is not a SAXTransformerFactory");
        }
        final TransformerHandler serialiser = ((SAXTransformerFactory) factory).newTransformerHandler();
        serialiser.getTransformer().setOutputProperty("method", "xml");
        serialiser.getTransformer().setOutputProperty("indent", "yes");
        serialiser.setResult(new StreamResult(file));
        return serialiser;
    }
}
