001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hdfs;
019
020import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitFdResponse.USE_RECEIPT_VERIFICATION;
021
022import java.io.BufferedOutputStream;
023import java.io.DataInputStream;
024import java.io.DataOutputStream;
025import java.io.FileInputStream;
026import java.io.IOException;
027import java.net.InetSocketAddress;
028
029import org.apache.commons.lang.mutable.MutableBoolean;
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.apache.hadoop.classification.InterfaceAudience;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.hdfs.net.DomainPeer;
035import org.apache.hadoop.hdfs.net.Peer;
036import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
037import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
038import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
039import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
040import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
041import org.apache.hadoop.hdfs.protocolPB.PBHelper;
042import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
043import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
044import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
045import org.apache.hadoop.hdfs.shortcircuit.DomainSocketFactory;
046import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache;
047import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitCache.ShortCircuitReplicaCreator;
048import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplica;
049import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitReplicaInfo;
050import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.Slot;
051import org.apache.hadoop.hdfs.shortcircuit.ShortCircuitShm.SlotId;
052import org.apache.hadoop.io.IOUtils;
053import org.apache.hadoop.ipc.RemoteException;
054import org.apache.hadoop.net.unix.DomainSocket;
055import org.apache.hadoop.security.AccessControlException;
056import org.apache.hadoop.security.UserGroupInformation;
057import org.apache.hadoop.security.token.SecretManager.InvalidToken;
058import org.apache.hadoop.security.token.Token;
059import org.apache.hadoop.util.PerformanceAdvisory;
060import org.apache.hadoop.util.Time;
061
062import com.google.common.annotations.VisibleForTesting;
063import com.google.common.base.Preconditions;
064
065
066/** 
067 * Utility class to create BlockReader implementations.
068 */
069@InterfaceAudience.Private
070public class BlockReaderFactory implements ShortCircuitReplicaCreator {
  /** Logger used by this class for its trace, debug and warning messages. */
  static final Log LOG = LogFactory.getLog(BlockReaderFactory.class);
072
073  public static class FailureInjector {
074    public void injectRequestFileDescriptorsFailure() throws IOException {
075      // do nothing
076    }
077    public boolean getSupportsReceiptVerification() {
078      return true;
079    }
080  }
081
  /**
   * Test hook.  When non-null, createShortCircuitReplicaInfo() asks this
   * creator first and returns its answer if that answer is non-null;
   * otherwise the normal code path runs.
   */
  @VisibleForTesting
  static ShortCircuitReplicaCreator
      createShortCircuitReplicaInfoCallback = null;
085
  /**
   * Client settings.  This class reads its short-circuit, domain-socket,
   * socket-timeout, I/O buffer size, legacy-reader and cached-connection
   * retry options from here.
   */
  private final DFSClient.Conf conf;

  /**
   * Injects failures into specific operations during unit tests.
   * Taken from conf.brfFailureInjector by the constructor.
   */
  private final FailureInjector failureInjector;

  /**
   * The file name, for logging and debugging purposes.
   */
  private String fileName;

  /**
   * The block ID and block pool ID to use.
   */
  private ExtendedBlock block;

  /**
   * The block token to use for security purposes.
   */
  private Token<BlockTokenIdentifier> token;

  /**
   * The offset within the block to start reading at.
   */
  private long startOffset;

  /**
   * If false, we won't try to verify the block checksum.
   */
  private boolean verifyChecksum;

  /**
   * The name of this client.
   */
  private String clientName; 

  /**
   * The DataNode we're talking to.
   */
  private DatanodeInfo datanode;

  /**
   * StorageType of replica on DataNode.
   */
  private StorageType storageType;

  /**
   * If false, we won't try short-circuit local reads.
   */
  private boolean allowShortCircuitLocalReads;

  /**
   * The ClientContext to use for things like the PeerCache, the
   * ShortCircuitCache and the DomainSocketFactory.
   */
  private ClientContext clientContext;

  /**
   * Number of bytes to read.  -1 indicates no limit.
   */
  private long length = -1;

  /**
   * Caching strategy to use when reading the block.
   */
  private CachingStrategy cachingStrategy;

  /**
   * Socket address to use to connect to peer.
   */
  private InetSocketAddress inetSocketAddress;

  /**
   * Remote peer factory to use to create a peer, if needed.
   */
  private RemotePeerFactory remotePeerFactory;

  /**
   * UserGroupInformation to use for legacy block reader local objects, if
   * needed.
   */
  private UserGroupInformation userGroupInformation;

  /**
   * Configuration to use for legacy block reader local objects, if needed.
   * build() requires this to be non-null.
   */
  private Configuration configuration;

  /**
   * Information about the domain socket path we should use to connect to the
   * local peer-- or null if we haven't examined the local domain socket.
   */
  private DomainSocketFactory.PathInfo pathInfo;

  /**
   * The remaining number of times that we'll try to pull a socket out of the
   * cache.  Starts at conf.nCachedConnRetry.
   */
  private int remainingCacheTries;
184
185  public BlockReaderFactory(DFSClient.Conf conf) {
186    this.conf = conf;
187    this.failureInjector = conf.brfFailureInjector;
188    this.remainingCacheTries = conf.nCachedConnRetry;
189  }
190
191  public BlockReaderFactory setFileName(String fileName) {
192    this.fileName = fileName;
193    return this;
194  }
195
196  public BlockReaderFactory setBlock(ExtendedBlock block) {
197    this.block = block;
198    return this;
199  }
200
201  public BlockReaderFactory setBlockToken(Token<BlockTokenIdentifier> token) {
202    this.token = token;
203    return this;
204  }
205
206  public BlockReaderFactory setStartOffset(long startOffset) {
207    this.startOffset = startOffset;
208    return this;
209  }
210
211  public BlockReaderFactory setVerifyChecksum(boolean verifyChecksum) {
212    this.verifyChecksum = verifyChecksum;
213    return this;
214  }
215
216  public BlockReaderFactory setClientName(String clientName) {
217    this.clientName = clientName;
218    return this;
219  }
220
221  public BlockReaderFactory setDatanodeInfo(DatanodeInfo datanode) {
222    this.datanode = datanode;
223    return this;
224  }
225
226  public BlockReaderFactory setStorageType(StorageType storageType) {
227    this.storageType = storageType;
228    return this;
229  }
230
231  public BlockReaderFactory setAllowShortCircuitLocalReads(
232      boolean allowShortCircuitLocalReads) {
233    this.allowShortCircuitLocalReads = allowShortCircuitLocalReads;
234    return this;
235  }
236
237  public BlockReaderFactory setClientCacheContext(
238      ClientContext clientContext) {
239    this.clientContext = clientContext;
240    return this;
241  }
242
243  public BlockReaderFactory setLength(long length) {
244    this.length = length;
245    return this;
246  }
247
248  public BlockReaderFactory setCachingStrategy(
249      CachingStrategy cachingStrategy) {
250    this.cachingStrategy = cachingStrategy;
251    return this;
252  }
253
254  public BlockReaderFactory setInetSocketAddress (
255      InetSocketAddress inetSocketAddress) {
256    this.inetSocketAddress = inetSocketAddress;
257    return this;
258  }
259
260  public BlockReaderFactory setUserGroupInformation(
261      UserGroupInformation userGroupInformation) {
262    this.userGroupInformation = userGroupInformation;
263    return this;
264  }
265
266  public BlockReaderFactory setRemotePeerFactory(
267      RemotePeerFactory remotePeerFactory) {
268    this.remotePeerFactory = remotePeerFactory;
269    return this;
270  }
271
272  public BlockReaderFactory setConfiguration(
273      Configuration configuration) {
274    this.configuration = configuration;
275    return this;
276  }
277
278  /**
279   * Build a BlockReader with the given options.
280   *
281   * This function will do the best it can to create a block reader that meets
282   * all of our requirements.  We prefer short-circuit block readers
283   * (BlockReaderLocal and BlockReaderLocalLegacy) over remote ones, since the
284   * former avoid the overhead of socket communication.  If short-circuit is
285   * unavailable, our next fallback is data transfer over UNIX domain sockets,
286   * if dfs.client.domain.socket.data.traffic has been enabled.  If that doesn't
287   * work, we will try to create a remote block reader that operates over TCP
288   * sockets.
289   *
290   * There are a few caches that are important here.
291   *
292   * The ShortCircuitCache stores file descriptor objects which have been passed
293   * from the DataNode. 
294   *
295   * The DomainSocketFactory stores information about UNIX domain socket paths
296   * that we not been able to use in the past, so that we don't waste time
297   * retrying them over and over.  (Like all the caches, it does have a timeout,
298   * though.)
299   *
300   * The PeerCache stores peers that we have used in the past.  If we can reuse
301   * one of these peers, we avoid the overhead of re-opening a socket.  However,
302   * if the socket has been timed out on the remote end, our attempt to reuse
303   * the socket may end with an IOException.  For that reason, we limit our
304   * attempts at socket reuse to dfs.client.cached.conn.retry times.  After
305   * that, we create new sockets.  This avoids the problem where a thread tries
306   * to talk to a peer that it hasn't talked to in a while, and has to clean out
307   * every entry in a socket cache full of stale entries.
308   *
309   * @return The new BlockReader.  We will not return null.
310   *
311   * @throws InvalidToken
312   *             If the block token was invalid.
313   *         InvalidEncryptionKeyException
314   *             If the encryption key was invalid.
315   *         Other IOException
316   *             If there was another problem.
317   */
318  public BlockReader build() throws IOException {
319    BlockReader reader = null;
320
321    Preconditions.checkNotNull(configuration);
322    if (conf.shortCircuitLocalReads && allowShortCircuitLocalReads) {
323      if (clientContext.getUseLegacyBlockReaderLocal()) {
324        reader = getLegacyBlockReaderLocal();
325        if (reader != null) {
326          if (LOG.isTraceEnabled()) {
327            LOG.trace(this + ": returning new legacy block reader local.");
328          }
329          return reader;
330        }
331      } else {
332        reader = getBlockReaderLocal();
333        if (reader != null) {
334          if (LOG.isTraceEnabled()) {
335            LOG.trace(this + ": returning new block reader local.");
336          }
337          return reader;
338        }
339      }
340    }
341    if (conf.domainSocketDataTraffic) {
342      reader = getRemoteBlockReaderFromDomain();
343      if (reader != null) {
344        if (LOG.isTraceEnabled()) {
345          LOG.trace(this + ": returning new remote block reader using " +
346              "UNIX domain socket on " + pathInfo.getPath());
347        }
348        return reader;
349      }
350    }
351    Preconditions.checkState(!DFSInputStream.tcpReadsDisabledForTesting,
352        "TCP reads were disabled for testing, but we failed to " +
353        "do a non-TCP read.");
354    return getRemoteBlockReaderFromTcp();
355  }
356
357  /**
358   * Get {@link BlockReaderLocalLegacy} for short circuited local reads.
359   * This block reader implements the path-based style of local reads
360   * first introduced in HDFS-2246.
361   */
362  private BlockReader getLegacyBlockReaderLocal() throws IOException {
363    if (LOG.isTraceEnabled()) {
364      LOG.trace(this + ": trying to construct BlockReaderLocalLegacy");
365    }
366    if (!DFSClient.isLocalAddress(inetSocketAddress)) {
367      if (LOG.isTraceEnabled()) {
368        LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
369            "the address " + inetSocketAddress + " is not local");
370      }
371      return null;
372    }
373    if (clientContext.getDisableLegacyBlockReaderLocal()) {
374      PerformanceAdvisory.LOG.debug(this + ": can't construct " +
375          "BlockReaderLocalLegacy because " +
376          "disableLegacyBlockReaderLocal is set.");
377      return null;
378    }
379    IOException ioe = null;
380    try {
381      return BlockReaderLocalLegacy.newBlockReader(conf,
382          userGroupInformation, configuration, fileName, block, token,
383          datanode, startOffset, length, storageType);
384    } catch (RemoteException remoteException) {
385      ioe = remoteException.unwrapRemoteException(
386                InvalidToken.class, AccessControlException.class);
387    } catch (IOException e) {
388      ioe = e;
389    }
390    if ((!(ioe instanceof AccessControlException)) &&
391        isSecurityException(ioe)) {
392      // Handle security exceptions.
393      // We do not handle AccessControlException here, since
394      // BlockReaderLocalLegacy#newBlockReader uses that exception to indicate
395      // that the user is not in dfs.block.local-path-access.user, a condition
396      // which requires us to disable legacy SCR.
397      throw ioe;
398    }
399    LOG.warn(this + ": error creating legacy BlockReaderLocal.  " +
400        "Disabling legacy local reads.", ioe);
401    clientContext.setDisableLegacyBlockReaderLocal();
402    return null;
403  }
404
405  private BlockReader getBlockReaderLocal() throws InvalidToken {
406    if (LOG.isTraceEnabled()) {
407      LOG.trace(this + ": trying to construct a BlockReaderLocal " +
408          "for short-circuit reads.");
409    }
410    if (pathInfo == null) {
411      pathInfo = clientContext.getDomainSocketFactory().
412                      getPathInfo(inetSocketAddress, conf);
413    }
414    if (!pathInfo.getPathState().getUsableForShortCircuit()) {
415      PerformanceAdvisory.LOG.debug(this + ": " + pathInfo + " is not " +
416          "usable for short circuit; giving up on BlockReaderLocal.");
417      return null;
418    }
419    ShortCircuitCache cache = clientContext.getShortCircuitCache();
420    ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
421    ShortCircuitReplicaInfo info = cache.fetchOrCreate(key, this);
422    InvalidToken exc = info.getInvalidTokenException();
423    if (exc != null) {
424      if (LOG.isTraceEnabled()) {
425        LOG.trace(this + ": got InvalidToken exception while trying to " +
426            "construct BlockReaderLocal via " + pathInfo.getPath());
427      }
428      throw exc;
429    }
430    if (info.getReplica() == null) {
431      if (LOG.isTraceEnabled()) {
432        PerformanceAdvisory.LOG.debug(this + ": failed to get " +
433            "ShortCircuitReplica. Cannot construct " +
434            "BlockReaderLocal via " + pathInfo.getPath());
435      }
436      return null;
437    }
438    return new BlockReaderLocal.Builder(conf).
439        setFilename(fileName).
440        setBlock(block).
441        setStartOffset(startOffset).
442        setShortCircuitReplica(info.getReplica()).
443        setVerifyChecksum(verifyChecksum).
444        setCachingStrategy(cachingStrategy).
445        setStorageType(storageType).
446        build();
447  }
448
449  /**
450   * Fetch a pair of short-circuit block descriptors from a local DataNode.
451   *
452   * @return    Null if we could not communicate with the datanode,
453   *            a new ShortCircuitReplicaInfo object otherwise.
454   *            ShortCircuitReplicaInfo objects may contain either an InvalidToken
455   *            exception, or a ShortCircuitReplica object ready to use.
456   */
457  @Override
458  public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
459    if (createShortCircuitReplicaInfoCallback != null) {
460      ShortCircuitReplicaInfo info =
461        createShortCircuitReplicaInfoCallback.createShortCircuitReplicaInfo();
462      if (info != null) return info;
463    }
464    if (LOG.isTraceEnabled()) {
465      LOG.trace(this + ": trying to create ShortCircuitReplicaInfo.");
466    }
467    BlockReaderPeer curPeer;
468    while (true) {
469      curPeer = nextDomainPeer();
470      if (curPeer == null) break;
471      if (curPeer.fromCache) remainingCacheTries--;
472      DomainPeer peer = (DomainPeer)curPeer.peer;
473      Slot slot = null;
474      ShortCircuitCache cache = clientContext.getShortCircuitCache();
475      try {
476        MutableBoolean usedPeer = new MutableBoolean(false);
477        slot = cache.allocShmSlot(datanode, peer, usedPeer,
478            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId()),
479            clientName);
480        if (usedPeer.booleanValue()) {
481          if (LOG.isTraceEnabled()) {
482            LOG.trace(this + ": allocShmSlot used up our previous socket " +
483              peer.getDomainSocket() + ".  Allocating a new one...");
484          }
485          curPeer = nextDomainPeer();
486          if (curPeer == null) break;
487          peer = (DomainPeer)curPeer.peer;
488        }
489        ShortCircuitReplicaInfo info = requestFileDescriptors(peer, slot);
490        clientContext.getPeerCache().put(datanode, peer);
491        return info;
492      } catch (IOException e) {
493        if (slot != null) {
494          cache.freeSlot(slot);
495        }
496        if (curPeer.fromCache) {
497          // Handle an I/O error we got when using a cached socket.
498          // These are considered less serious, because the socket may be stale.
499          if (LOG.isDebugEnabled()) {
500            LOG.debug(this + ": closing stale domain peer " + peer, e);
501          }
502          IOUtils.cleanup(LOG, peer);
503        } else {
504          // Handle an I/O error we got when using a newly created socket.
505          // We temporarily disable the domain socket path for a few minutes in
506          // this case, to prevent wasting more time on it.
507          LOG.warn(this + ": I/O error requesting file descriptors.  " + 
508              "Disabling domain socket " + peer.getDomainSocket(), e);
509          IOUtils.cleanup(LOG, peer);
510          clientContext.getDomainSocketFactory()
511              .disableDomainSocketPath(pathInfo.getPath());
512          return null;
513        }
514      }
515    }
516    return null;
517  }
518
519  /**
520   * Request file descriptors from a DomainPeer.
521   *
522   * @param peer   The peer to use for communication.
523   * @param slot   If non-null, the shared memory slot to associate with the 
524   *               new ShortCircuitReplica.
525   * 
526   * @return  A ShortCircuitReplica object if we could communicate with the
527   *          datanode; null, otherwise. 
528   * @throws  IOException If we encountered an I/O exception while communicating
529   *          with the datanode.
530   */
531  private ShortCircuitReplicaInfo requestFileDescriptors(DomainPeer peer,
532          Slot slot) throws IOException {
533    ShortCircuitCache cache = clientContext.getShortCircuitCache();
534    final DataOutputStream out =
535        new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
536    SlotId slotId = slot == null ? null : slot.getSlotId();
537    new Sender(out).requestShortCircuitFds(block, token, slotId, 1,
538        failureInjector.getSupportsReceiptVerification());
539    DataInputStream in = new DataInputStream(peer.getInputStream());
540    BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
541        PBHelper.vintPrefixed(in));
542    DomainSocket sock = peer.getDomainSocket();
543    failureInjector.injectRequestFileDescriptorsFailure();
544    switch (resp.getStatus()) {
545    case SUCCESS:
546      byte buf[] = new byte[1];
547      FileInputStream fis[] = new FileInputStream[2];
548      sock.recvFileInputStreams(fis, buf, 0, buf.length);
549      ShortCircuitReplica replica = null;
550      try {
551        ExtendedBlockId key =
552            new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
553        if (buf[0] == USE_RECEIPT_VERIFICATION.getNumber()) {
554          LOG.trace("Sending receipt verification byte for slot " + slot);
555          sock.getOutputStream().write(0);
556        }
557        replica = new ShortCircuitReplica(key, fis[0], fis[1], cache,
558            Time.monotonicNow(), slot);
559        return new ShortCircuitReplicaInfo(replica);
560      } catch (IOException e) {
561        // This indicates an error reading from disk, or a format error.  Since
562        // it's not a socket communication problem, we return null rather than
563        // throwing an exception.
564        LOG.warn(this + ": error creating ShortCircuitReplica.", e);
565        return null;
566      } finally {
567        if (replica == null) {
568          IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
569        }
570      }
571    case ERROR_UNSUPPORTED:
572      if (!resp.hasShortCircuitAccessVersion()) {
573        LOG.warn("short-circuit read access is disabled for " +
574            "DataNode " + datanode + ".  reason: " + resp.getMessage());
575        clientContext.getDomainSocketFactory()
576            .disableShortCircuitForPath(pathInfo.getPath());
577      } else {
578        LOG.warn("short-circuit read access for the file " +
579            fileName + " is disabled for DataNode " + datanode +
580            ".  reason: " + resp.getMessage());
581      }
582      return null;
583    case ERROR_ACCESS_TOKEN:
584      String msg = "access control error while " +
585          "attempting to set up short-circuit access to " +
586          fileName + resp.getMessage();
587      if (LOG.isDebugEnabled()) {
588        LOG.debug(this + ":" + msg);
589      }
590      return new ShortCircuitReplicaInfo(new InvalidToken(msg));
591    default:
592      LOG.warn(this + ": unknown response code " + resp.getStatus() +
593          " while attempting to set up short-circuit access. " +
594          resp.getMessage());
595      clientContext.getDomainSocketFactory()
596          .disableShortCircuitForPath(pathInfo.getPath());
597      return null;
598    }
599  }
600
601  /**
602   * Get a RemoteBlockReader that communicates over a UNIX domain socket.
603   *
604   * @return The new BlockReader, or null if we failed to create the block
605   * reader.
606   *
607   * @throws InvalidToken    If the block token was invalid.
608   * Potentially other security-related execptions.
609   */
610  private BlockReader getRemoteBlockReaderFromDomain() throws IOException {
611    if (pathInfo == null) {
612      pathInfo = clientContext.getDomainSocketFactory().
613                      getPathInfo(inetSocketAddress, conf);
614    }
615    if (!pathInfo.getPathState().getUsableForDataTransfer()) {
616      PerformanceAdvisory.LOG.debug(this + ": not trying to create a " +
617          "remote block reader because the UNIX domain socket at " +
618          pathInfo + " is not usable.");
619      return null;
620    }
621    if (LOG.isTraceEnabled()) {
622      LOG.trace(this + ": trying to create a remote block reader from the " +
623          "UNIX domain socket at " + pathInfo.getPath());
624    }
625
626    while (true) {
627      BlockReaderPeer curPeer = nextDomainPeer();
628      if (curPeer == null) break;
629      if (curPeer.fromCache) remainingCacheTries--;
630      DomainPeer peer = (DomainPeer)curPeer.peer;
631      BlockReader blockReader = null;
632      try {
633        blockReader = getRemoteBlockReader(peer);
634        return blockReader;
635      } catch (IOException ioe) {
636        IOUtils.cleanup(LOG, peer);
637        if (isSecurityException(ioe)) {
638          if (LOG.isTraceEnabled()) {
639            LOG.trace(this + ": got security exception while constructing " +
640                "a remote block reader from the unix domain socket at " +
641                pathInfo.getPath(), ioe);
642          }
643          throw ioe;
644        }
645        if (curPeer.fromCache) {
646          // Handle an I/O error we got when using a cached peer.  These are
647          // considered less serious, because the underlying socket may be stale.
648          if (LOG.isDebugEnabled()) {
649            LOG.debug("Closed potentially stale domain peer " + peer, ioe);
650          }
651        } else {
652          // Handle an I/O error we got when using a newly created domain peer.
653          // We temporarily disable the domain socket path for a few minutes in
654          // this case, to prevent wasting more time on it.
655          LOG.warn("I/O error constructing remote block reader.  Disabling " +
656              "domain socket " + peer.getDomainSocket(), ioe);
657          clientContext.getDomainSocketFactory()
658              .disableDomainSocketPath(pathInfo.getPath());
659          return null;
660        }
661      } finally {
662        if (blockReader == null) {
663          IOUtils.cleanup(LOG, peer);
664        }
665      }
666    }
667    return null;
668  }
669
670  /**
671   * Get a RemoteBlockReader that communicates over a TCP socket.
672   *
673   * @return The new BlockReader.  We will not return null, but instead throw
674   *         an exception if this fails.
675   *
676   * @throws InvalidToken
677   *             If the block token was invalid.
678   *         InvalidEncryptionKeyException
679   *             If the encryption key was invalid.
680   *         Other IOException
681   *             If there was another problem.
682   */
683  private BlockReader getRemoteBlockReaderFromTcp() throws IOException {
684    if (LOG.isTraceEnabled()) {
685      LOG.trace(this + ": trying to create a remote block reader from a " +
686          "TCP socket");
687    }
688    BlockReader blockReader = null;
689    while (true) {
690      BlockReaderPeer curPeer = null;
691      Peer peer = null;
692      try {
693        curPeer = nextTcpPeer();
694        if (curPeer == null) break;
695        if (curPeer.fromCache) remainingCacheTries--;
696        peer = curPeer.peer;
697        blockReader = getRemoteBlockReader(peer);
698        return blockReader;
699      } catch (IOException ioe) {
700        if (isSecurityException(ioe)) {
701          if (LOG.isTraceEnabled()) {
702            LOG.trace(this + ": got security exception while constructing " +
703                "a remote block reader from " + peer, ioe);
704          }
705          throw ioe;
706        }
707        if ((curPeer != null) && curPeer.fromCache) {
708          // Handle an I/O error we got when using a cached peer.  These are
709          // considered less serious, because the underlying socket may be
710          // stale.
711          if (LOG.isDebugEnabled()) {
712            LOG.debug("Closed potentially stale remote peer " + peer, ioe);
713          }
714        } else {
715          // Handle an I/O error we got when using a newly created peer.
716          LOG.warn("I/O error constructing remote block reader.", ioe);
717          throw ioe;
718        }
719      } finally {
720        if (blockReader == null) {
721          IOUtils.cleanup(LOG, peer);
722        }
723      }
724    }
725    return null;
726  }
727
728  public static class BlockReaderPeer {
729    final Peer peer;
730    final boolean fromCache;
731    
732    BlockReaderPeer(Peer peer, boolean fromCache) {
733      this.peer = peer;
734      this.fromCache = fromCache;
735    }
736  }
737
738  /**
739   * Get the next DomainPeer-- either from the cache or by creating it.
740   *
741   * @return the next DomainPeer, or null if we could not construct one.
742   */
743  private BlockReaderPeer nextDomainPeer() {
744    if (remainingCacheTries > 0) {
745      Peer peer = clientContext.getPeerCache().get(datanode, true);
746      if (peer != null) {
747        if (LOG.isTraceEnabled()) {
748          LOG.trace("nextDomainPeer: reusing existing peer " + peer);
749        }
750        return new BlockReaderPeer(peer, true);
751      }
752    }
753    DomainSocket sock = clientContext.getDomainSocketFactory().
754        createSocket(pathInfo, conf.socketTimeout);
755    if (sock == null) return null;
756    return new BlockReaderPeer(new DomainPeer(sock), false);
757  }
758
759  /**
760   * Get the next TCP-based peer-- either from the cache or by creating it.
761   *
762   * @return the next Peer, or null if we could not construct one.
763   *
764   * @throws IOException  If there was an error while constructing the peer
765   *                      (such as an InvalidEncryptionKeyException)
766   */
767  private BlockReaderPeer nextTcpPeer() throws IOException {
768    if (remainingCacheTries > 0) {
769      Peer peer = clientContext.getPeerCache().get(datanode, false);
770      if (peer != null) {
771        if (LOG.isTraceEnabled()) {
772          LOG.trace("nextTcpPeer: reusing existing peer " + peer);
773        }
774        return new BlockReaderPeer(peer, true);
775      }
776    }
777    try {
778      Peer peer = remotePeerFactory.newConnectedPeer(inetSocketAddress, token,
779        datanode);
780      if (LOG.isTraceEnabled()) {
781        LOG.trace("nextTcpPeer: created newConnectedPeer " + peer);
782      }
783      return new BlockReaderPeer(peer, false);
784    } catch (IOException e) {
785      if (LOG.isTraceEnabled()) {
786        LOG.trace("nextTcpPeer: failed to create newConnectedPeer " +
787                  "connected to " + datanode);
788      }
789      throw e;
790    }
791  }
792
793  /**
794   * Determine if an exception is security-related.
795   *
796   * We need to handle these exceptions differently than other IOExceptions.
797   * They don't indicate a communication problem.  Instead, they mean that there
798   * is some action the client needs to take, such as refetching block tokens,
799   * renewing encryption keys, etc.
800   *
801   * @param ioe    The exception
802   * @return       True only if the exception is security-related.
803   */
804  private static boolean isSecurityException(IOException ioe) {
805    return (ioe instanceof InvalidToken) ||
806            (ioe instanceof InvalidEncryptionKeyException) ||
807            (ioe instanceof InvalidBlockTokenException) ||
808            (ioe instanceof AccessControlException);
809  }
810
811  @SuppressWarnings("deprecation")
812  private BlockReader getRemoteBlockReader(Peer peer) throws IOException {
813    if (conf.useLegacyBlockReader) {
814      return RemoteBlockReader.newBlockReader(fileName,
815          block, token, startOffset, length, conf.ioBufferSize,
816          verifyChecksum, clientName, peer, datanode,
817          clientContext.getPeerCache(), cachingStrategy);
818    } else {
819      return RemoteBlockReader2.newBlockReader(
820          fileName, block, token, startOffset, length,
821          verifyChecksum, clientName, peer, datanode,
822          clientContext.getPeerCache(), cachingStrategy);
823    }
824  }
825
826  @Override
827  public String toString() {
828    return "BlockReaderFactory(fileName=" + fileName + ", block=" + block + ")";
829  }
830
831  /**
832   * File name to print when accessing a block directly (from servlets)
833   * @param s Address of the block location
834   * @param poolId Block pool ID of the block
835   * @param blockId Block ID of the block
836   * @return string that has a file name for debug purposes
837   */
838  public static String getFileName(final InetSocketAddress s,
839      final String poolId, final long blockId) {
840    return s.toString() + ":" + poolId + ":" + blockId;
841  }
842}