001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitFdResponse.USE_RECEIPT_VERIFICATION;
021
022import java.io.BufferedOutputStream;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.FileInputStream;
026import java.io.IOException;
027import java.net.InetSocketAddress;
028
029import org.apache.commons.lang.mutable.MutableBoolean;
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.fs.StorageType;
035import org.apache.hadoop.hdfs.net.DomainPeer;
036import org.apache.hadoop.hdfs.net.Peer;
037import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
038import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
039import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
040import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
041import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
042import org.apache.hadoop.hdfs.protocolPB.PBHelper;
043import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
044import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
045import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
046import org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory;
047import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache;
048import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.ShortCircuitReplicaCreator;
049import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica;
050import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplicaInfo;
051import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
052import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
053import org.apache.hadoop.io.IOUtils;
054import org.apache.hadoop.ipc.RemoteException;
055import org.apache.hadoop.net.unix.DomainSocket;
056import org.apache.hadoop.security.AccessControlException;
057import org.apache.hadoop.security.UserGroupInformation;
058import org.apache.hadoop.security.token.SecretManager.InvalidToken;
059import org.apache.hadoop.security.token.Token;
060import org.apache.hadoop.util.PerformanceAdvisory;
061import org.apache.hadoop.util.Time;
062
063import com.google.common.annotations.VisibleForTesting;
064import com.google.common.base.Preconditions;
065
066
067/** 
068 * Utility class to create BlockReader implementations.
069 */
070@InterfaceAudience.Private
071public class BlockReaderFactory implements ShortCircuitReplicaCreator {
072  static final Log LOG = LogFactory.getLog(BlockReaderFactory.class);
073
074  public static class FailureInjector {
075    public void injectRequestFileDescriptorsFailure() throws IOException {
076      // do nothing
077    }
078  }
079
080  @VisibleForTesting
081  static ShortCircuitReplicaCreator
082      createShortCircuitReplicaInfoCallback = null;
083
084  private final DFSClient.Conf conf;
085
086  /**
087   * Injects failures into specific operations during unit tests.
088   */
089  private final FailureInjector failureInjector;
090
091  /**
092   * The file name, for logging and debugging purposes.
093   */
094  private String fileName;
095
096  /**
097   * The block ID and block pool ID to use.
098   */
099  private ExtendedBlock block;
100
101  /**
102   * The block token to use for security purposes.
103   */
104  private Token<BlockTokenIdentifier> token;
105
106  /**
107   * The offset within the block to start reading at.
108   */
109  private long startOffset;
110
111  /**
112   * If false, we won't try to verify the block checksum.
113   */
114  private boolean verifyChecksum;
115
116  /**
117   * The name of this client.
118   */
119  private String clientName; 
120
121  /**
122   * The DataNode we're talking to.
123   */
124  private DatanodeInfo datanode;
125
126  /**
127   * StorageType of replica on DataNode.
128   */
129  private StorageType storageType;
130
131  /**
132   * If false, we won't try short-circuit local reads.
133   */
134  private boolean allowShortCircuitLocalReads;
135
136  /**
137   * The ClientContext to use for things like the PeerCache.
138   */
139  private ClientContext clientContext;
140
141  /**
142   * Number of bytes to read.  -1 indicates no limit.
143   */
144  private long length = -1;
145
146  /**
147   * Caching strategy to use when reading the block.
148   */
149  private CachingStrategy cachingStrategy;
150
151  /**
152   * Socket address to use to connect to peer.
153   */
154  private InetSocketAddress inetSocketAddress;
155
156  /**
157   * Remote peer factory to use to create a peer, if needed.
158   */
159  private RemotePeerFactory remotePeerFactory;
160
161  /**
162   * UserGroupInformation  to use for legacy block reader local objects, if needed.
163   */
164  private UserGroupInformation userGroupInformation;
165
166  /**
167   * Configuration to use for legacy block reader local objects, if needed.
168   */
169  private Configuration configuration;
170
171  /**
172   * Information about the domain socket path we should use to connect to the
173   * local peer-- or null if we haven't examined the local domain socket.
174   */
175  private DomainSocketFactory.PathInfo pathInfo;
176
177  /**
178   * The remaining number of times that we'll try to pull a socket out of the
179   * cache.
180   */
181  private int remainingCacheTries;
182
183  public BlockReaderFactory(DFSClient.Conf conf) {
184    this.conf = conf;
185    this.failureInjector = conf.brfFailureInjector;
186    this.remainingCacheTries = conf.nCachedConnRetry;
187  }
188
189  public BlockReaderFactory setFileName(String fileName) {
190    this.fileName = fileName;
191    return this;
192  }
193
194  public BlockReaderFactory setBlock(ExtendedBlock block) {
195    this.block = block;
196    return this;
197  }
198
199  public BlockReaderFactory setBlockToken(Token<BlockTokenIdentifier> token) {
200    this.token = token;
201    return this;
202  }
203
204  public BlockReaderFactory setStartOffset(long startOffset) {
205    this.startOffset = startOffset;
206    return this;
207  }
208
209  public BlockReaderFactory setVerifyChecksum(boolean verifyChecksum) {
210    this.verifyChecksum = verifyChecksum;
211    return this;
212  }
213
214  public BlockReaderFactory setClientName(String clientName) {
215    this.clientName = clientName;
216    return this;
217  }
218
219  public BlockReaderFactory setDatanodeInfo(DatanodeInfo datanode) {
220    this.datanode = datanode;
221    return this;
222  }
223
224  public BlockReaderFactory setStorageType(StorageType storageType) {
225    this.storageType = storageType;
226    return this;
227  }
228
229  public BlockReaderFactory setAllowShortCircuitLocalReads(
230      boolean allowShortCircuitLocalReads) {
231    this.allowShortCircuitLocalReads = allowShortCircuitLocalReads;
232    return this;
233  }
234
235  public BlockReaderFactory setClientCacheContext(
236      ClientContext clientContext) {
237    this.clientContext = clientContext;
238    return this;
239  }
240
241  public BlockReaderFactory setLength(long length) {
242    this.length = length;
243    return this;
244  }
245
246  public BlockReaderFactory setCachingStrategy(
247      CachingStrategy cachingStrategy) {
248    this.cachingStrategy = cachingStrategy;
249    return this;
250  }
251
252  public BlockReaderFactory setInetSocketAddress (
253      InetSocketAddress inetSocketAddress) {
254    this.inetSocketAddress = inetSocketAddress;
255    return this;
256  }
257
258  public BlockReaderFactory setUserGroupInformation(
259      UserGroupInformation userGroupInformation) {
260    this.userGroupInformation = userGroupInformation;
261    return this;
262  }
263
264  public BlockReaderFactory setRemotePeerFactory(
265      RemotePeerFactory remotePeerFactory) {
266    this.remotePeerFactory = remotePeerFactory;
267    return this;
268  }
269
270  public BlockReaderFactory setConfiguration(
271      Configuration configuration) {
272    this.configuration = configuration;
273    return this;
274  }
275
276  /**
277   * Build a BlockReader with the given options.
278   *
279   * This function will do the best it can to create a block reader that meets
280   * all of our requirements.  We prefer short-circuit block readers
281   * (BlockReaderLocal and BlockReaderLocalLegacy) over remote ones, since the
282   * former avoid the overhead of socket communication.  If short-circuit is
283   * unavailable, our next fallback is data transfer over UNIX domain sockets,
284   * if dfs.client.domain.socket.data.traffic has been enabled.  If that doesn't
285   * work, we will try to create a remote block reader that operates over TCP
286   * sockets.
287   *
288   * There are a few caches that are important here.
289   *
290   * The ShortCircuitCache stores file descriptor objects which have been passed
291   * from the DataNode. 
292   *
293   * The DomainSocketFactory stores information about UNIX domain socket paths
294   * that we not been able to use in the past, so that we don't waste time
295   * retrying them over and over.  (Like all the caches, it does have a timeout,
296   * though.)
297   *
298   * The PeerCache stores peers that we have used in the past.  If we can reuse
299   * one of these peers, we avoid the overhead of re-opening a socket.  However,
300   * if the socket has been timed out on the remote end, our attempt to reuse
301   * the socket may end with an IOException.  For that reason, we limit our
302   * attempts at socket reuse to dfs.client.cached.conn.retry times.  After
303   * that, we create new sockets.  This avoids the problem where a thread tries
304   * to talk to a peer that it hasn't talked to in a while, and has to clean out
305   * every entry in a socket cache full of stale entries.
306   *
307   * @return The new BlockReader.  We will not return null.
308   *
309   * @throws InvalidToken
310   *             If the block token was invalid.
311   *         InvalidEncryptionKeyException
312   *             If the encryption key was invalid.
313   *         Other IOException
314   *             If there was another problem.
315   */
316  public BlockReader build() throws IOException {
317    BlockReader reader = null;
318
319    Preconditions.checkNotNull(configuration);
320    if (conf.shortCircuitLocalReads && allowShortCircuitLocalReads) {
321      if (clientContext.getUseLegacyBlockReaderLocal()) {
322        reader = getLegacyBlockReaderLocal();
323        if (reader != null) {
324          if (LOG.isTraceEnabled()) {
325            LOG.trace(this + ": returning new legacy block reader local.");
326          }
327          return reader;
328        }
329      } else {
330        reader = getBlockReaderLocal();
331        if (reader != null) {
332          if (LOG.isTraceEnabled()) {
333            LOG.trace(this + ": returning new block reader local.");
334          }
335          return reader;
336        }
337      }
338    }
339    if (conf.domainSocketDataTraffic) {
340      reader = getRemoteBlockReaderFromDomain();
341      if (reader != null) {
342        if (LOG.isTraceEnabled()) {
343          LOG.trace(this + ": returning new remote block reader using " +
344              "UNIX domain socket on " + pathInfo.getPath());
345        }
346        return reader;
347      }
348    }
349    Preconditions.checkState(!DFSInputStream.tcpReadsDisabledForTesting,
350        "TCP reads were disabled for testing, but we failed to " +
351        "do a non-TCP read.");
352    return getRemoteBlockReaderFromTcp();
353  }
354
355  /**
356   * Get {@link BlockReaderLocalLegacy} for short circuited local reads.
357   * This block reader implements the path-based style of local reads
358   * first introduced in HDFS-2246.
359   */
360  private BlockReader getLegacyBlockReaderLocal() throws IOException {
361    if (LOG.isTraceEnabled()) {
362      LOG.trace(this + ": trying to construct BlockReaderLocalLegacy");
363    }
364    if (!DFSClient.isLocalAddress(inetSocketAddress)) {
365      if (LOG.isTraceEnabled()) {
366        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
367            "the address " + inetSocketAddress + " is not local");
368      }
369      return null;
370    }
371    if (clientContext.getDisableLegacyBlockReaderLocal()) {
372      PerformanceAdvisory.LOG.debug(this + ": can't construct " +
373          "BlockReaderLocalLegacy because " +
374          "disableLegacyBlockReaderLocal is set.");
375      return null;
376    }
377    IOException ioe = null;
378    try {
379      return BlockReaderLocalLegacy.newBlockReader(conf,
380          userGroupInformation, configuration, fileName, block, token,
381          datanode, startOffset, length, storageType);
382    } catch (RemoteException remoteException) {
383      ioe = remoteException.unwrapRemoteException(
384                InvalidToken.class, AccessControlException.class);
385    } catch (IOException e) {
386      ioe = e;
387    }
388    if ((!(ioe instanceof AccessControlException)) &&
389        isSecurityException(ioe)) {
390      // Handle security exceptions.
391      // We do not handle AccessControlException here, since
392      // BlockReaderLocalLegacy#newBlockReader uses that exception to indicate
393      // that the user is not in dfs.block.local-path-access.user, a condition
394      // which requires us to disable legacy SCR.
395      throw ioe;
396    }
397    LOG.warn(this + ": error creating legacy BlockReaderLocal.  " +
398        "Disabling legacy local reads.", ioe);
399    clientContext.setDisableLegacyBlockReaderLocal();
400    return null;
401  }
402
403  private BlockReader getBlockReaderLocal() throws InvalidToken {
404    if (LOG.isTraceEnabled()) {
405      LOG.trace(this + ": trying to construct a BlockReaderLocal " +
406          "for short-circuit reads.");
407    }
408    if (pathInfo == null) {
409      pathInfo = clientContext.getDomainSocketFactory().
410                      getPathInfo(inetSocketAddress, conf);
411    }
412    if (!pathInfo.getPathState().getUsableForShortCircuit()) {
413      PerformanceAdvisory.LOG.debug(this + ": " + pathInfo + " is not " +
414          "usable for short circuit; giving up on BlockReaderLocal.");
415      return null;
416    }
417    ShortCircuitCache cache = clientContext.getShortCircuitCache();
418    ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
419    ShortCircuitReplicaInfo info = cache.fetchOrCreate(key, this);
420    InvalidToken exc = info.getInvalidTokenException();
421    if (exc != null) {
422      if (LOG.isTraceEnabled()) {
423        LOG.trace(this + ": got InvalidToken exception while trying to " +
424            "construct BlockReaderLocal via " + pathInfo.getPath());
425      }
426      throw exc;
427    }
428    if (info.getReplica() == null) {
429      if (LOG.isTraceEnabled()) {
430        PerformanceAdvisory.LOG.debug(this + ": failed to get " +
431            "ShortCircuitReplica. Cannot construct " +
432            "BlockReaderLocal via " + pathInfo.getPath());
433      }
434      return null;
435    }
436    return new BlockReaderLocal.Builder(conf).
437        setFilename(fileName).
438        setBlock(block).
439        setStartOffset(startOffset).
440        setShortCircuitReplica(info.getReplica()).
441        setVerifyChecksum(verifyChecksum).
442        setCachingStrategy(cachingStrategy).
443        setStorageType(storageType).
444        build();
445  }
446
447  /**
448   * Fetch a pair of short-circuit block descriptors from a local DataNode.
449   *
450   * @return    Null if we could not communicate with the datanode,
451   *            a new ShortCircuitReplicaInfo object otherwise.
452   *            ShortCircuitReplicaInfo objects may contain either an InvalidToken
453   *            exception, or a ShortCircuitReplica object ready to use.
454   */
455  @Override
456  public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
457    if (createShortCircuitReplicaInfoCallback != null) {
458      ShortCircuitReplicaInfo info =
459        createShortCircuitReplicaInfoCallback.createShortCircuitReplicaInfo();
460      if (info != null) return info;
461    }
462    if (LOG.isTraceEnabled()) {
463      LOG.trace(this + ": trying to create ShortCircuitReplicaInfo.");
464    }
465    BlockReaderPeer curPeer;
466    while (true) {
467      curPeer = nextDomainPeer();
468      if (curPeer == null) break;
469      if (curPeer.fromCache) remainingCacheTries--;
470      DomainPeer peer = (DomainPeer)curPeer.peer;
471      Slot slot = null;
472      ShortCircuitCache cache = clientContext.getShortCircuitCache();
473      try {
474        MutableBoolean usedPeer = new MutableBoolean(false);
475        slot = cache.allocShmSlot(datanode, peer, usedPeer,
476            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId()),
477            clientName);
478        if (usedPeer.booleanValue()) {
479          if (LOG.isTraceEnabled()) {
480            LOG.trace(this + ": allocShmSlot used up our previous socket " +
481              peer.getDomainSocket() + ".  Allocating a new one...");
482          }
483          curPeer = nextDomainPeer();
484          if (curPeer == null) break;
485          peer = (DomainPeer)curPeer.peer;
486        }
487        ShortCircuitReplicaInfo info = requestFileDescriptors(peer, slot);
488        clientContext.getPeerCache().put(datanode, peer);
489        return info;
490      } catch (IOException e) {
491        if (slot != null) {
492          cache.freeSlot(slot);
493        }
494        if (curPeer.fromCache) {
495          // Handle an I/O error we got when using a cached socket.
496          // These are considered less serious, because the socket may be stale.
497          if (LOG.isDebugEnabled()) {
498            LOG.debug(this + ": closing stale domain peer " + peer, e);
499          }
500          IOUtils.cleanup(LOG, peer);
501        } else {
502          // Handle an I/O error we got when using a newly created socket.
503          // We temporarily disable the domain socket path for a few minutes in
504          // this case, to prevent wasting more time on it.
505          LOG.warn(this + ": I/O error requesting file descriptors.  " + 
506              "Disabling domain socket " + peer.getDomainSocket(), e);
507          IOUtils.cleanup(LOG, peer);
508          clientContext.getDomainSocketFactory()
509              .disableDomainSocketPath(pathInfo.getPath());
510          return null;
511        }
512      }
513    }
514    return null;
515  }
516
517  /**
518   * Request file descriptors from a DomainPeer.
519   *
520   * @param peer   The peer to use for communication.
521   * @param slot   If non-null, the shared memory slot to associate with the 
522   *               new ShortCircuitReplica.
523   * 
524   * @return  A ShortCircuitReplica object if we could communicate with the
525   *          datanode; null, otherwise. 
526   * @throws  IOException If we encountered an I/O exception while communicating
527   *          with the datanode.
528   */
529  private ShortCircuitReplicaInfo requestFileDescriptors(DomainPeer peer,
530          Slot slot) throws IOException {
531    ShortCircuitCache cache = clientContext.getShortCircuitCache();
532    final DataOutputStream out =
533        new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
534    SlotId slotId = slot == null ? null : slot.getSlotId();
535    new Sender(out).requestShortCircuitFds(block, token, slotId, 1, true);
536    DataInputStream in = new DataInputStream(peer.getInputStream());
537    BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
538        PBHelper.vintPrefixed(in));
539    DomainSocket sock = peer.getDomainSocket();
540    failureInjector.injectRequestFileDescriptorsFailure();
541    switch (resp.getStatus()) {
542    case SUCCESS:
543      byte buf[] = new byte[1];
544      FileInputStream fis[] = new FileInputStream[2];
545      sock.recvFileInputStreams(fis, buf, 0, buf.length);
546      ShortCircuitReplica replica = null;
547      try {
548        ExtendedBlockId key =
549            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
550        if (buf[0] == USE_RECEIPT_VERIFICATION.getNumber()) {
551          LOG.trace("Sending receipt verification byte for slot " + slot);
552          sock.getOutputStream().write(0);
553        }
554        replica = new ShortCircuitReplica(key, fis[0], fis[1], cache,
555            Time.monotonicNow(), slot);
556        return new ShortCircuitReplicaInfo(replica);
557      } catch (IOException e) {
558        // This indicates an error reading from disk, or a format error.  Since
559        // it's not a socket communication problem, we return null rather than
560        // throwing an exception.
561        LOG.warn(this + ": error creating ShortCircuitReplica.", e);
562        return null;
563      } finally {
564        if (replica == null) {
565          IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
566        }
567      }
568    case ERROR_UNSUPPORTED:
569      if (!resp.hasShortCircuitAccessVersion()) {
570        LOG.warn("short-circuit read access is disabled for " +
571            "DataNode " + datanode + ".  reason: " + resp.getMessage());
572        clientContext.getDomainSocketFactory()
573            .disableShortCircuitForPath(pathInfo.getPath());
574      } else {
575        LOG.warn("short-circuit read access for the file " +
576            fileName + " is disabled for DataNode " + datanode +
577            ".  reason: " + resp.getMessage());
578      }
579      return null;
580    case ERROR_ACCESS_TOKEN:
581      String msg = "access control error while " +
582          "attempting to set up short-circuit access to " +
583          fileName + resp.getMessage();
584      if (LOG.isDebugEnabled()) {
585        LOG.debug(this + ":" + msg);
586      }
587      return new ShortCircuitReplicaInfo(new InvalidToken(msg));
588    default:
589      LOG.warn(this + ": unknown response code " + resp.getStatus() +
590          " while attempting to set up short-circuit access. " +
591          resp.getMessage());
592      clientContext.getDomainSocketFactory()
593          .disableShortCircuitForPath(pathInfo.getPath());
594      return null;
595    }
596  }
597
598  /**
599   * Get a RemoteBlockReader that communicates over a UNIX domain socket.
600   *
601   * @return The new BlockReader, or null if we failed to create the block
602   * reader.
603   *
604   * @throws InvalidToken    If the block token was invalid.
605   * Potentially other security-related execptions.
606   */
607  private BlockReader getRemoteBlockReaderFromDomain() throws IOException {
608    if (pathInfo == null) {
609      pathInfo = clientContext.getDomainSocketFactory().
610                      getPathInfo(inetSocketAddress, conf);
611    }
612    if (!pathInfo.getPathState().getUsableForDataTransfer()) {
613      PerformanceAdvisory.LOG.debug(this + ": not trying to create a " +
614          "remote block reader because the UNIX domain socket at " +
615          pathInfo + " is not usable.");
616      return null;
617    }
618    if (LOG.isTraceEnabled()) {
619      LOG.trace(this + ": trying to create a remote block reader from the " +
620          "UNIX domain socket at " + pathInfo.getPath());
621    }
622
623    while (true) {
624      BlockReaderPeer curPeer = nextDomainPeer();
625      if (curPeer == null) break;
626      if (curPeer.fromCache) remainingCacheTries--;
627      DomainPeer peer = (DomainPeer)curPeer.peer;
628      BlockReader blockReader = null;
629      try {
630        blockReader = getRemoteBlockReader(peer);
631        return blockReader;
632      } catch (IOException ioe) {
633        IOUtils.cleanup(LOG, peer);
634        if (isSecurityException(ioe)) {
635          if (LOG.isTraceEnabled()) {
636            LOG.trace(this + ": got security exception while constructing " +
637                "a remote block reader from the unix domain socket at " +
638                pathInfo.getPath(), ioe);
639          }
640          throw ioe;
641        }
642        if (curPeer.fromCache) {
643          // Handle an I/O error we got when using a cached peer.  These are
644          // considered less serious, because the underlying socket may be stale.
645          if (LOG.isDebugEnabled()) {
646            LOG.debug("Closed potentially stale domain peer " + peer, ioe);
647          }
648        } else {
649          // Handle an I/O error we got when using a newly created domain peer.
650          // We temporarily disable the domain socket path for a few minutes in
651          // this case, to prevent wasting more time on it.
652          LOG.warn("I/O error constructing remote block reader.  Disabling " +
653              "domain socket " + peer.getDomainSocket(), ioe);
654          clientContext.getDomainSocketFactory()
655              .disableDomainSocketPath(pathInfo.getPath());
656          return null;
657        }
658      } finally {
659        if (blockReader == null) {
660          IOUtils.cleanup(LOG, peer);
661        }
662      }
663    }
664    return null;
665  }
666
667  /**
668   * Get a RemoteBlockReader that communicates over a TCP socket.
669   *
670   * @return The new BlockReader.  We will not return null, but instead throw
671   *         an exception if this fails.
672   *
673   * @throws InvalidToken
674   *             If the block token was invalid.
675   *         InvalidEncryptionKeyException
676   *             If the encryption key was invalid.
677   *         Other IOException
678   *             If there was another problem.
679   */
680  private BlockReader getRemoteBlockReaderFromTcp() throws IOException {
681    if (LOG.isTraceEnabled()) {
682      LOG.trace(this + ": trying to create a remote block reader from a " +
683          "TCP socket");
684    }
685    BlockReader blockReader = null;
686    while (true) {
687      BlockReaderPeer curPeer = null;
688      Peer peer = null;
689      try {
690        curPeer = nextTcpPeer();
691        if (curPeer.fromCache) remainingCacheTries--;
692        peer = curPeer.peer;
693        blockReader = getRemoteBlockReader(peer);
694        return blockReader;
695      } catch (IOException ioe) {
696        if (isSecurityException(ioe)) {
697          if (LOG.isTraceEnabled()) {
698            LOG.trace(this + ": got security exception while constructing " +
699                "a remote block reader from " + peer, ioe);
700          }
701          throw ioe;
702        }
703        if ((curPeer != null) && curPeer.fromCache) {
704          // Handle an I/O error we got when using a cached peer.  These are
705          // considered less serious, because the underlying socket may be
706          // stale.
707          if (LOG.isDebugEnabled()) {
708            LOG.debug("Closed potentially stale remote peer " + peer, ioe);
709          }
710        } else {
711          // Handle an I/O error we got when using a newly created peer.
712          LOG.warn("I/O error constructing remote block reader.", ioe);
713          throw ioe;
714        }
715      } finally {
716        if (blockReader == null) {
717          IOUtils.cleanup(LOG, peer);
718        }
719      }
720    }
721  }
722
723  public static class BlockReaderPeer {
724    final Peer peer;
725    final boolean fromCache;
726    
727    BlockReaderPeer(Peer peer, boolean fromCache) {
728      this.peer = peer;
729      this.fromCache = fromCache;
730    }
731  }
732
733  /**
734   * Get the next DomainPeer-- either from the cache or by creating it.
735   *
736   * @return the next DomainPeer, or null if we could not construct one.
737   */
738  private BlockReaderPeer nextDomainPeer() {
739    if (remainingCacheTries > 0) {
740      Peer peer = clientContext.getPeerCache().get(datanode, true);
741      if (peer != null) {
742        if (LOG.isTraceEnabled()) {
743          LOG.trace("nextDomainPeer: reusing existing peer " + peer);
744        }
745        return new BlockReaderPeer(peer, true);
746      }
747    }
748    DomainSocket sock = clientContext.getDomainSocketFactory().
749        createSocket(pathInfo, conf.socketTimeout);
750    if (sock == null) return null;
751    return new BlockReaderPeer(new DomainPeer(sock), false);
752  }
753
754  /**
755   * Get the next TCP-based peer-- either from the cache or by creating it.
756   *
757   * @return the next Peer, or null if we could not construct one.
758   *
759   * @throws IOException  If there was an error while constructing the peer
760   *                      (such as an InvalidEncryptionKeyException)
761   */
762  private BlockReaderPeer nextTcpPeer() throws IOException {
763    if (remainingCacheTries > 0) {
764      Peer peer = clientContext.getPeerCache().get(datanode, false);
765      if (peer != null) {
766        if (LOG.isTraceEnabled()) {
767          LOG.trace("nextTcpPeer: reusing existing peer " + peer);
768        }
769        return new BlockReaderPeer(peer, true);
770      }
771    }
772    try {
773      Peer peer = remotePeerFactory.newConnectedPeer(inetSocketAddress, token,
774        datanode);
775      if (LOG.isTraceEnabled()) {
776        LOG.trace("nextTcpPeer: created newConnectedPeer " + peer);
777      }
778      return new BlockReaderPeer(peer, false);
779    } catch (IOException e) {
780      if (LOG.isTraceEnabled()) {
781        LOG.trace("nextTcpPeer: failed to create newConnectedPeer " +
782                  "connected to " + datanode);
783      }
784      throw e;
785    }
786  }
787
788  /**
789   * Determine if an exception is security-related.
790   *
791   * We need to handle these exceptions differently than other IOExceptions.
792   * They don't indicate a communication problem.  Instead, they mean that there
793   * is some action the client needs to take, such as refetching block tokens,
794   * renewing encryption keys, etc.
795   *
796   * @param ioe    The exception
797   * @return       True only if the exception is security-related.
798   */
799  private static boolean isSecurityException(IOException ioe) {
800    return (ioe instanceof InvalidToken) ||
801            (ioe instanceof InvalidEncryptionKeyException) ||
802            (ioe instanceof InvalidBlockTokenException) ||
803            (ioe instanceof AccessControlException);
804  }
805
806  @SuppressWarnings("deprecation")
807  private BlockReader getRemoteBlockReader(Peer peer) throws IOException {
808    if (conf.useLegacyBlockReader) {
809      return RemoteBlockReader.newBlockReader(fileName,
810          block, token, startOffset, length, conf.ioBufferSize,
811          verifyChecksum, clientName, peer, datanode,
812          clientContext.getPeerCache(), cachingStrategy);
813    } else {
814      return RemoteBlockReader2.newBlockReader(
815          fileName, block, token, startOffset, length,
816          verifyChecksum, clientName, peer, datanode,
817          clientContext.getPeerCache(), cachingStrategy);
818    }
819  }
820
821  @Override
822  public String toString() {
823    return "BlockReaderFactory(fileName=" + fileName + ", block=" + block + ")";
824  }
825
826  /**
827   * File name to print when accessing a block directly (from servlets)
828   * @param s Address of the block location
829   * @param poolId Block pool ID of the block
830   * @param blockId Block ID of the block
831   * @return string that has a file name for debug purposes
832   */
833  public static String getFileName(final InetSocketAddress s,
834      final String poolId, final long blockId) {
835    return s.toString() + ":" + poolId + ":" + blockId;
836  }
837}