Package org.apache.tika.parser.microsoft
Class WordExtractor
java.lang.Object
org.apache.tika.parser.microsoft.WordExtractor
-
Nested Class Summary
Nested Classes -
Field Summary
Fields
Modifier and Type | Field | Description
protected final org.apache.tika.parser.ParseContext | context |
protected final OfficeParserConfig | officeParserConfig |
protected final org.apache.tika.metadata.Metadata | parentMetadata |
-
Constructor Summary
Constructors
Constructor | Description
WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata) |
-
Method Summary
Modifier and Type | Method | Description
static WordExtractor.TagAndStyle | buildParagraphTagAndStyle(String styleName, boolean isTable) | Given a style name, return what tag should be used, and what style should be applied to it.
protected org.apache.tika.detect.Detector | getDetector() |
protected String | getPassword() | Returns the password to be used for this file, or null if no / default password should be used.
protected org.apache.tika.config.TikaConfig | getTikaConfig() |
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level.
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level.
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
static String | tryToGetMsgTitle(org.apache.poi.poifs.filesystem.DirectoryEntry node, String defaultVal) |
-
Field Details
-
parentMetadata
protected final org.apache.tika.metadata.Metadata parentMetadata -
officeParserConfig
protected final OfficeParserConfig officeParserConfig
-
context
protected final org.apache.tika.parser.ParseContext context
-
-
Constructor Details
-
WordExtractor
public WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata)
-
-
Method Details
-
buildParagraphTagAndStyle
public static WordExtractor.TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable)
Given a style name, return what tag should be used, and what style should be applied to it.
-
parse
protected void parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parse
protected void parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException
Throws:
IOException
SAXException
-
getTikaConfig
protected org.apache.tika.config.TikaConfig getTikaConfig() -
getDetector
protected org.apache.tika.detect.Detector getDetector() -
getPassword
protected String getPassword()
Returns the password to be used for this file, or null if no / default password should be used.
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
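tryToGetMsgTitle
static String tryToGetMsgTitle(org.apache.poi.poifs.filesystem.DirectoryEntry node, String defaultVal)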
-