Class WordExtractor

java.lang.Object
org.apache.tika.parser.microsoft.WordExtractor

public class WordExtractor extends Object
  • Nested Class Summary

    Nested Classes
    Modifier and Type
    Class
    Description
    static class 
    WordExtractor.TagAndStyle
     
  • Field Summary

    Fields
    Modifier and Type
    Field
    Description
    protected final org.apache.tika.parser.ParseContext
    context
     
    protected final OfficeParserConfig
    officeParserConfig
     
    protected final org.apache.tika.metadata.Metadata
    parentMetadata
     
  • Constructor Summary

    Constructors
    Constructor
    Description
    WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata)
     
  • Method Summary

    Modifier and Type
    Method
    Description
    static WordExtractor.TagAndStyle
    buildParagraphTagAndStyle(String styleName, boolean isTable)
    Given a style name, return what tag should be used, and what style should be applied to it.
    protected org.apache.tika.detect.Detector
    getDetector()
     
    protected String
    getPassword()
    Returns the password to be used for this file, or null if no / default password should be used
    protected org.apache.tika.config.TikaConfig
    getTikaConfig()
     
    protected void
    handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml)
    Handle an office document that's embedded at the POIFS level
    protected void
    handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml)
    Handle an office document that's embedded at the POIFS level
    protected void
    handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml)
     
    protected void
    handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml)
     
    protected void
    handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml)
     
    protected void
    parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml)
     
    protected void
    parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml)
     
    protected void
    parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml)
     
    protected void
    parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml)
     
    static String
    tryToGetMsgTitle(org.apache.poi.poifs.filesystem.DirectoryEntry node, String defaultVal)
     

    Methods inherited from class java.lang.Object

    clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
  • Field Details

    • parentMetadata

      protected final org.apache.tika.metadata.Metadata parentMetadata
    • officeParserConfig

      protected final OfficeParserConfig officeParserConfig
    • context

      protected final org.apache.tika.parser.ParseContext context
  • Constructor Details

    • WordExtractor

      public WordExtractor(org.apache.tika.parser.ParseContext context, org.apache.tika.metadata.Metadata metadata)
  • Method Details

    • buildParagraphTagAndStyle

      public static WordExtractor.TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable)
      Given a style name, return what tag should be used, and what style should be applied to it.
    • parse

      protected void parse(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • parse

      protected void parse(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • parseWord6

      protected void parseWord6(org.apache.poi.poifs.filesystem.POIFSFileSystem filesystem, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • parseWord6

      protected void parseWord6(org.apache.poi.poifs.filesystem.DirectoryNode root, org.apache.tika.sax.XHTMLContentHandler xhtml) throws IOException, SAXException
      Throws:
      IOException
      SAXException
    • getTikaConfig

      protected org.apache.tika.config.TikaConfig getTikaConfig()
    • getDetector

      protected org.apache.tika.detect.Detector getDetector()
    • getPassword

      protected String getPassword()
      Returns the password to be used for this file, or null if no / default password should be used
    • handleEmbeddedResource

      protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • handleEmbeddedResource

      protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • handleEmbeddedResource

      protected void handleEmbeddedResource(org.apache.tika.io.TikaInputStream resource, org.apache.tika.metadata.Metadata embeddedMetadata, String filename, String relationshipID, org.apache.poi.hpsf.ClassID storageClassID, String mediaType, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • handleEmbeddedOfficeDoc

      protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Handle an office document that's embedded at the POIFS level
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • handleEmbeddedOfficeDoc

      protected void handleEmbeddedOfficeDoc(org.apache.poi.poifs.filesystem.DirectoryEntry dir, String resourceName, org.apache.tika.sax.XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, org.apache.tika.exception.TikaException
      Handle an office document that's embedded at the POIFS level
      Throws:
      IOException
      SAXException
      org.apache.tika.exception.TikaException
    • tryToGetMsgTitle

      public static String tryToGetMsgTitle(org.apache.poi.poifs.filesystem.DirectoryEntry node, String defaultVal)