Package org.apache.tika.parser.microsoft
Class WordExtractor
- java.lang.Object
-
- org.apache.tika.parser.microsoft.WordExtractor
-
public class WordExtractor extends Object
-
-
Nested Class Summary
Nested Classes
Modifier and Type | Class | Description
static class | WordExtractor.TagAndStyle |
-
Field Summary
Fields
Modifier and Type | Field | Description
protected org.apache.tika.parser.ParseContext | context |
protected OfficeParserConfig | officeParserConfig |
protected org.apache.tika.metadata.Metadata | parentMetadata |
-
Constructor Summary
Constructors Constructor Description WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata)
-
Method Summary
All Methods | Static Methods | Instance Methods | Concrete Methods
Modifier and Type | Method | Description
static WordExtractor.TagAndStyle | buildParagraphTagAndStyle(String styleName, boolean isTable) | Given a style name, return what tag should be used, and what style should be applied to it.
protected org.apache.tika.detect.Detector | getDetector() |
protected String | getPassword() | Returns the password to be used for this file, or null if no / default password should be used
protected org.apache.tika.config.TikaConfig | getTikaConfig() |
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.metadata.Metadata metadata, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level
protected void | handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) | Handle an office document that's embedded at the POIFS level
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) |
protected void | parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) |
protected void | parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) |
static String | tryToGetMsgTitle(org.apache.poi.poifs.filesystem.DirectoryEntry node, String defaultVal) |
-
-
-
Field Detail
-
parentMetadata
protected final org.apache.tika.metadata.Metadata parentMetadata
-
officeParserConfig
protected final OfficeParserConfig officeParserConfig
-
context
protected final org.apache.tika.parser.ParseContext context
-
-
Method Detail
-
buildParagraphTagAndStyle
public static WordExtractor.TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable)
Given a style name, return what tag should be used, and what style should be applied to it.
-
parse
protected void parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parse
protected void parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
parseWord6
protected void parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException
- Throws:
IOException
SAXException
-
getTikaConfig
protected org.apache.tika.config.TikaConfig getTikaConfig()
-
getDetector
protected org.apache.tika.detect.Detector getDetector()
-
getPassword
protected String getPassword()
Returns the password to be used for this file, or null if no / default password should be used
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedResource
protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
handleEmbeddedOfficeDoc
protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.metadata.Metadata metadata, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
Handle an office document that's embedded at the POIFS level.
- Throws:
IOException
SAXException
org.apache.tika.exception.TikaException
-
-