package gate.corpora;

import gate.Document;
import gate.DocumentFormat;
import gate.Resource;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleResource;
import gate.util.DocumentFormatException;
import java.io.InputStream;
import java.net.URL;
import org.textmining.text.extraction.WordExtractor;

/**
 * Document format handler for Microsoft Word ({@code .doc}) files.
 *
 * <p>On initialisation the handler registers itself in the shared
 * {@link DocumentFormat} lookup tables under the MIME type
 * {@code application/msword} and the file suffix {@code doc}. Unpacking
 * replaces the document content with the plain text that
 * {@link WordExtractor} extracts from the document's source URL.
 */
@CreoleResource(name = "GATE MSWord Document Format", isPrivate = true, autoinstances = {@AutoInstance(hidden = true)})
/* loaded from: input_file:gate/corpora/MSWordDocumentFormat.class */
public class MSWordDocumentFormat extends DocumentFormat {
    /**
     * Registers this handler for the {@code application/msword} MIME type
     * and the {@code doc} file suffix, and sets it as this format's MIME type.
     *
     * @return this resource
     * @throws ResourceInstantiationException declared by the overridden method;
     *         not thrown by this implementation
     */
    @Override // gate.creole.AbstractResource, gate.Resource
    public Resource init() throws ResourceInstantiationException {
        // The subtype must not carry surrounding whitespace, otherwise the
        // registration key would be "application/ msword".
        MimeType mimeType = new MimeType("application", "msword");
        String mimeKey = mimeType.getType() + "/" + mimeType.getSubtype();
        mimeString2ClassHandlerMap.put(mimeKey, this);
        mimeString2mimeTypeMap.put(mimeKey, mimeType);
        suffixes2mimeTypeMap.put("doc", mimeType);
        setMimeType(mimeType);
        return this;
    }

    /**
     * Reports whether this format supports repositioning information.
     *
     * @return always {@link Boolean#FALSE}
     */
    @Override // gate.DocumentFormat
    public Boolean supportsRepositioning() {
        return Boolean.FALSE;
    }

    /**
     * Replaces the content of {@code document} with the text extracted from
     * the Word file located at the document's source URL.
     *
     * @param document the document whose content is to be replaced; it must
     *        have a source URL
     * @throws DocumentFormatException if the document has no source URL, or if
     *         opening, reading, extracting or closing the stream fails (the
     *         underlying exception is attached as the cause)
     */
    @Override // gate.DocumentFormat
    public void unpackMarkup(Document document) throws DocumentFormatException {
        URL sourceUrl = document.getSourceUrl();
        if (sourceUrl == null) {
            throw new DocumentFormatException("Unpacking MS Word files requires an URL to the original content!");
        }
        try {
            InputStream wordStream = sourceUrl.openStream();
            String extractedText;
            try {
                extractedText = new WordExtractor().extractText(wordStream);
            } finally {
                // Close the stream even when extraction fails so the
                // underlying connection/file handle is not leaked.
                wordStream.close();
            }
            document.setContent(new DocumentContentImpl(extractedText));
        } catch (Exception e) {
            throw new DocumentFormatException("Exception for " + sourceUrl.toExternalForm(), e);
        }
    }

    /**
     * Unpacks the document exactly like {@link #unpackMarkup(Document)}; the
     * repositioning arguments are ignored.
     *
     * @param document the document whose content is to be replaced
     * @param repositioningInfo unused
     * @param repositioningInfo2 unused
     * @throws DocumentFormatException under the same conditions as
     *         {@link #unpackMarkup(Document)}
     */
    @Override // gate.DocumentFormat
    public void unpackMarkup(Document document, RepositioningInfo repositioningInfo, RepositioningInfo repositioningInfo2) throws DocumentFormatException {
        unpackMarkup(document);
    }
}
