org.apache.accumulo.core.client.mapreduce
Class AccumuloInputFormat

java.lang.Object
  extended by org.apache.hadoop.mapreduce.InputFormat<Key,Value>
      extended by org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat

public class AccumuloInputFormat
extends org.apache.hadoop.mapreduce.InputFormat<Key,Value>

This class allows MapReduce jobs to use Accumulo as the source of data. This input format provides keys and values of type Key and Value to the Map() and Reduce() functions. The user must specify the following via static methods: the connection parameters (username, password, table name, and authorizations, via setInputInfo) and the Accumulo instance to use (via setZooKeeperInstance, or setMockInstance for testing).

Other static methods are optional


Nested Class Summary
static class AccumuloInputFormat.RangeInputSplit
          The Class RangeInputSplit.
static class AccumuloInputFormat.RegexType
           
 
Constructor Summary
AccumuloInputFormat()
           
 
Method Summary
 org.apache.hadoop.mapreduce.RecordReader<Key,Value> createRecordReader(org.apache.hadoop.mapreduce.InputSplit inSplit, org.apache.hadoop.mapreduce.TaskAttemptContext attempt)
           
static void disableAutoAdjustRanges(org.apache.hadoop.mapreduce.JobContext job)
           
static void fetchColumns(org.apache.hadoop.mapreduce.JobContext job, java.util.Collection<Pair<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text>> columnFamilyColumnQualifierPairs)
           
protected static Authorizations getAuthorizations(org.apache.hadoop.mapreduce.JobContext job)
           
protected static boolean getAutoAdjustRanges(org.apache.hadoop.mapreduce.JobContext job)
           
protected static java.util.Set<Pair<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text>> getFetchedColumns(org.apache.hadoop.mapreduce.JobContext job)
           
protected static Instance getInstance(org.apache.hadoop.mapreduce.JobContext job)
           
protected static java.util.List<org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat.AccumuloIteratorOption> getIteratorOptions(org.apache.hadoop.mapreduce.JobContext job)
           
protected static java.util.List<org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat.AccumuloIterator> getIterators(org.apache.hadoop.mapreduce.JobContext job)
           
protected static org.apache.log4j.Level getLogLevel(org.apache.hadoop.mapreduce.JobContext job)
           
protected static int getMaxVersions(org.apache.hadoop.mapreduce.JobContext job)
           
protected static byte[] getPassword(org.apache.hadoop.mapreduce.JobContext job)
          WARNING: The password is stored in the Configuration and shared with all MapReduce tasks. It is BASE64 encoded to provide a charset-safe conversion to a string, and is not intended to be secure.
protected static java.util.List<Range> getRanges(org.apache.hadoop.mapreduce.JobContext job)
           
protected static java.lang.String getRegex(org.apache.hadoop.mapreduce.JobContext job, AccumuloInputFormat.RegexType type)
           
 java.util.List<org.apache.hadoop.mapreduce.InputSplit> getSplits(org.apache.hadoop.mapreduce.JobContext job)
          Reads the metadata table to determine the tablets of interest; each of these becomes an input split.
protected static java.lang.String getTablename(org.apache.hadoop.mapreduce.JobContext job)
           
protected static TabletLocator getTabletLocator(org.apache.hadoop.mapreduce.JobContext job)
           
protected static java.lang.String getUsername(org.apache.hadoop.mapreduce.JobContext job)
           
protected static boolean isIsolated(org.apache.hadoop.mapreduce.JobContext job)
           
static void setInputInfo(org.apache.hadoop.mapreduce.JobContext job, java.lang.String user, byte[] passwd, java.lang.String table, Authorizations auths)
           
static void setIsolated(org.apache.hadoop.mapreduce.JobContext job, boolean enable)
          Enable or disable use of the IsolatedScanner.
static void setIterator(org.apache.hadoop.mapreduce.JobContext job, int priority, java.lang.String iteratorClass, java.lang.String iteratorName)
          Specify an Accumulo iterator type to manage the behavior of the underlying table scan this InputFormat's Record Reader will conduct, w/ priority dictating the order in which specified iterators are applied.
static void setIteratorOption(org.apache.hadoop.mapreduce.JobContext job, java.lang.String iteratorName, java.lang.String key, java.lang.String value)
          Specify an option for a named Accumulo iterator, further specifying that iterator's behavior.
static void setLogLevel(org.apache.hadoop.mapreduce.JobContext job, org.apache.log4j.Level level)
           
static void setMaxVersions(org.apache.hadoop.mapreduce.JobContext job, int maxVersions)
          Sets the max # of values that may be returned for an individual Accumulo cell.
static void setMockInstance(org.apache.hadoop.mapreduce.JobContext job, java.lang.String instanceName)
           
static void setRanges(org.apache.hadoop.mapreduce.JobContext job, java.util.Collection<Range> ranges)
           
static void setRegex(org.apache.hadoop.mapreduce.JobContext job, AccumuloInputFormat.RegexType type, java.lang.String regex)
           
static void setZooKeeperInstance(org.apache.hadoop.mapreduce.JobContext job, java.lang.String instanceName, java.lang.String zooKeepers)
           
protected static void validateOptions(org.apache.hadoop.mapreduce.JobContext job)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

AccumuloInputFormat

public AccumuloInputFormat()
Method Detail

setIsolated

public static void setIsolated(org.apache.hadoop.mapreduce.JobContext job,
                               boolean enable)
Enable or disable use of the IsolatedScanner. By default it is not enabled.

Parameters:
job -
enable -

setInputInfo

public static void setInputInfo(org.apache.hadoop.mapreduce.JobContext job,
                                java.lang.String user,
                                byte[] passwd,
                                java.lang.String table,
                                Authorizations auths)

setZooKeeperInstance

public static void setZooKeeperInstance(org.apache.hadoop.mapreduce.JobContext job,
                                        java.lang.String instanceName,
                                        java.lang.String zooKeepers)

setMockInstance

public static void setMockInstance(org.apache.hadoop.mapreduce.JobContext job,
                                   java.lang.String instanceName)

setRanges

public static void setRanges(org.apache.hadoop.mapreduce.JobContext job,
                             java.util.Collection<Range> ranges)

disableAutoAdjustRanges

public static void disableAutoAdjustRanges(org.apache.hadoop.mapreduce.JobContext job)

setRegex

public static void setRegex(org.apache.hadoop.mapreduce.JobContext job,
                            AccumuloInputFormat.RegexType type,
                            java.lang.String regex)

setMaxVersions

public static void setMaxVersions(org.apache.hadoop.mapreduce.JobContext job,
                                  int maxVersions)
                           throws java.io.IOException
Sets the max # of values that may be returned for an individual Accumulo cell. By default, applied before all other Accumulo iterators (highest priority) leveraged in the scan by the record reader. To adjust the priority, use setIterator() & setIteratorOption() w/ the VersioningIterator type explicitly.

Parameters:
job - the job
maxVersions - the max versions
Throws:
java.io.IOException

fetchColumns

public static void fetchColumns(org.apache.hadoop.mapreduce.JobContext job,
                                java.util.Collection<Pair<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text>> columnFamilyColumnQualifierPairs)
Parameters:
columnFamilyColumnQualifierPairs - A collection of pairs of Text objects, each corresponding to a column family and column qualifier. If the column qualifier is null, the entire column family is selected. An empty set is the default and is equivalent to scanning all the columns.

setLogLevel

public static void setLogLevel(org.apache.hadoop.mapreduce.JobContext job,
                               org.apache.log4j.Level level)

setIterator

public static void setIterator(org.apache.hadoop.mapreduce.JobContext job,
                               int priority,
                               java.lang.String iteratorClass,
                               java.lang.String iteratorName)
Specify an Accumulo iterator type to manage the behavior of the underlying table scan this InputFormat's Record Reader will conduct, w/ priority dictating the order in which specified iterators are applied. Repeat calls to specify multiple iterators are allowed.

Parameters:
job - the job
priority - the priority
iteratorClass - the iterator class
iteratorName - the iterator name

setIteratorOption

public static void setIteratorOption(org.apache.hadoop.mapreduce.JobContext job,
                                     java.lang.String iteratorName,
                                     java.lang.String key,
                                     java.lang.String value)
Specify an option for a named Accumulo iterator, further specifying that iterator's behavior.

Parameters:
job - the job
iteratorName - the iterator name. Should correspond to an iterator set w/ a prior setIterator call.
key - the key
value - the value

isIsolated

protected static boolean isIsolated(org.apache.hadoop.mapreduce.JobContext job)

getUsername

protected static java.lang.String getUsername(org.apache.hadoop.mapreduce.JobContext job)

getPassword

protected static byte[] getPassword(org.apache.hadoop.mapreduce.JobContext job)
WARNING: The password is stored in the Configuration and shared with all MapReduce tasks. It is BASE64 encoded to provide a charset-safe conversion to a string, and is not intended to be secure.


getTablename

protected static java.lang.String getTablename(org.apache.hadoop.mapreduce.JobContext job)

getAuthorizations

protected static Authorizations getAuthorizations(org.apache.hadoop.mapreduce.JobContext job)

getInstance

protected static Instance getInstance(org.apache.hadoop.mapreduce.JobContext job)

getTabletLocator

protected static TabletLocator getTabletLocator(org.apache.hadoop.mapreduce.JobContext job)
                                         throws TableNotFoundException
Throws:
TableNotFoundException

getRanges

protected static java.util.List<Range> getRanges(org.apache.hadoop.mapreduce.JobContext job)
                                          throws java.io.IOException
Throws:
java.io.IOException

getRegex

protected static java.lang.String getRegex(org.apache.hadoop.mapreduce.JobContext job,
                                           AccumuloInputFormat.RegexType type)

getFetchedColumns

protected static java.util.Set<Pair<org.apache.hadoop.io.Text,org.apache.hadoop.io.Text>> getFetchedColumns(org.apache.hadoop.mapreduce.JobContext job)

getAutoAdjustRanges

protected static boolean getAutoAdjustRanges(org.apache.hadoop.mapreduce.JobContext job)

getLogLevel

protected static org.apache.log4j.Level getLogLevel(org.apache.hadoop.mapreduce.JobContext job)

validateOptions

protected static void validateOptions(org.apache.hadoop.mapreduce.JobContext job)
                               throws java.io.IOException
Throws:
java.io.IOException

getMaxVersions

protected static int getMaxVersions(org.apache.hadoop.mapreduce.JobContext job)

getIterators

protected static java.util.List<org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat.AccumuloIterator> getIterators(org.apache.hadoop.mapreduce.JobContext job)

getIteratorOptions

protected static java.util.List<org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat.AccumuloIteratorOption> getIteratorOptions(org.apache.hadoop.mapreduce.JobContext job)

createRecordReader

public org.apache.hadoop.mapreduce.RecordReader<Key,Value> createRecordReader(org.apache.hadoop.mapreduce.InputSplit inSplit,
                                                                              org.apache.hadoop.mapreduce.TaskAttemptContext attempt)
                                                                       throws java.io.IOException,
                                                                              java.lang.InterruptedException
Specified by:
createRecordReader in class org.apache.hadoop.mapreduce.InputFormat<Key,Value>
Throws:
java.io.IOException
java.lang.InterruptedException

getSplits

public java.util.List<org.apache.hadoop.mapreduce.InputSplit> getSplits(org.apache.hadoop.mapreduce.JobContext job)
                                                                 throws java.io.IOException
Reads the metadata table to determine the tablets of interest; each of these becomes an input split.

Specified by:
getSplits in class org.apache.hadoop.mapreduce.InputFormat<Key,Value>
Throws:
java.io.IOException


Copyright © 2012 The Apache Software Foundation. All Rights Reserved.