001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.client;
019    
020    import com.google.common.annotations.VisibleForTesting;
021    import com.google.common.base.Preconditions;
022    
023    import java.io.BufferedOutputStream;
024    import java.io.Closeable;
025    import java.io.DataOutputStream;
026    import java.io.EOFException;
027    import java.io.FileInputStream;
028    import java.io.IOException;
029    import java.util.HashMap;
030    import java.util.TreeMap;
031    import java.util.Map.Entry;
032    import java.util.concurrent.locks.Condition;
033    import java.util.concurrent.locks.ReentrantLock;
034    
035    import org.apache.commons.lang.mutable.MutableBoolean;
036    import org.apache.commons.logging.Log;
037    import org.apache.commons.logging.LogFactory;
038    import org.apache.hadoop.hdfs.ExtendedBlockId;
039    import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
040    import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
041    import org.apache.hadoop.hdfs.net.DomainPeer;
042    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
043    import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
044    import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
045    import org.apache.hadoop.hdfs.protocolPB.PBHelper;
046    import org.apache.hadoop.io.IOUtils;
047    import org.apache.hadoop.net.unix.DomainSocket;
048    import org.apache.hadoop.net.unix.DomainSocketWatcher;
049    import org.apache.hadoop.classification.InterfaceAudience;
050    
051    /**
052     * Manages short-circuit memory segments for an HDFS client.
053     * 
054     * Clients are responsible for requesting and releasing shared memory segments used
055     * for communicating with the DataNode. The client will try to allocate new slots
056     * in the set of existing segments, falling back to getting a new segment from the
057     * DataNode via {@link DataTransferProtocol#requestShortCircuitFds}.
058     * 
059     * The counterpart to this class on the DataNode is {@link ShortCircuitRegistry}.
060     * See {@link ShortCircuitRegistry} for more information on the communication protocol.
061     */
062    @InterfaceAudience.Private
063    public class DfsClientShmManager implements Closeable {
064      private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);
065    
066      /**
067       * Manages short-circuit memory segments that pertain to a given DataNode.
068       */
069      class EndpointShmManager {
070        /**
071         * The datanode we're managing.
072         */
073        private final DatanodeInfo datanode;
074    
075        /**
076         * Shared memory segments which have no empty slots.
077         *
078         * Protected by the manager lock.
079         */
080        private final TreeMap<ShmId, DfsClientShm> full =
081            new TreeMap<ShmId, DfsClientShm>();
082    
083        /**
084         * Shared memory segments which have at least one empty slot.
085         *
086         * Protected by the manager lock.
087         */
088        private final TreeMap<ShmId, DfsClientShm> notFull =
089            new TreeMap<ShmId, DfsClientShm>();
090    
091        /**
092         * True if this datanode doesn't support short-circuit shared memory
093         * segments.
094         *
095         * Protected by the manager lock.
096         */
097        private boolean disabled = false;
098    
099        /**
100         * True if we're in the process of loading a shared memory segment from
101         * this DataNode.
102         *
103         * Protected by the manager lock.
104         */
105        private boolean loading = false;
106    
107        EndpointShmManager (DatanodeInfo datanode) {
108          this.datanode = datanode;
109        }
110    
111        /**
112         * Pull a slot out of a preexisting shared memory segment.
113         *
114         * Must be called with the manager lock held.
115         *
116         * @param blockId     The blockId to put inside the Slot object.
117         *
118         * @return            null if none of our shared memory segments contain a
119         *                      free slot; the slot object otherwise.
120         */
121        private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
122          if (notFull.isEmpty()) {
123            return null;
124          }
125          Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
126          DfsClientShm shm = entry.getValue();
127          ShmId shmId = shm.getShmId();
128          Slot slot = shm.allocAndRegisterSlot(blockId);
129          if (shm.isFull()) {
130            if (LOG.isTraceEnabled()) {
131              LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
132                  " out of " + shm);
133            }
134            DfsClientShm removedShm = notFull.remove(shmId);
135            Preconditions.checkState(removedShm == shm);
136            full.put(shmId, shm);
137          } else {
138            if (LOG.isTraceEnabled()) {
139              LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
140                  " out of " + shm);
141            }
142          }
143          return slot;
144        }
145    
146        /**
147         * Ask the DataNode for a new shared memory segment.  This function must be
148         * called with the manager lock held.  We will release the lock while
149         * communicating with the DataNode.
150         *
151         * @param clientName    The current client name.
152         * @param peer          The peer to use to talk to the DataNode.
153         *
154         * @return              Null if the DataNode does not support shared memory
155         *                        segments, or experienced an error creating the
156         *                        shm.  The shared memory segment itself on success.
157         * @throws IOException  If there was an error communicating over the socket.
158         *                        We will not throw an IOException unless the socket
159         *                        itself (or the network) is the problem.
160         */
161        private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
162            throws IOException {
163          final DataOutputStream out = 
164              new DataOutputStream(
165                  new BufferedOutputStream(peer.getOutputStream()));
166          new Sender(out).requestShortCircuitShm(clientName);
167          ShortCircuitShmResponseProto resp = 
168              ShortCircuitShmResponseProto.parseFrom(
169                  PBHelper.vintPrefixed(peer.getInputStream()));
170          String error = resp.hasError() ? resp.getError() : "(unknown)";
171          switch (resp.getStatus()) {
172          case SUCCESS:
173            DomainSocket sock = peer.getDomainSocket();
174            byte buf[] = new byte[1];
175            FileInputStream fis[] = new FileInputStream[1];
176            if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
177              throw new EOFException("got EOF while trying to transfer the " +
178                  "file descriptor for the shared memory segment.");
179            }
180            if (fis[0] == null) {
181              throw new IOException("the datanode " + datanode + " failed to " +
182                  "pass a file descriptor for the shared memory segment.");
183            }
184            try {
185              DfsClientShm shm = 
186                  new DfsClientShm(PBHelper.convert(resp.getId()),
187                      fis[0], this, peer);
188              if (LOG.isTraceEnabled()) {
189                LOG.trace(this + ": createNewShm: created " + shm);
190              }
191              return shm;
192            } finally {
193              IOUtils.cleanup(LOG,  fis[0]);
194            }
195          case ERROR_UNSUPPORTED:
196            // The DataNode just does not support short-circuit shared memory
197            // access, and we should stop asking.
198            LOG.info(this + ": datanode does not support short-circuit " +
199                "shared memory access: " + error);
200            disabled = true;
201            return null;
202          default:
203            // The datanode experienced some kind of unexpected error when trying to
204            // create the short-circuit shared memory segment.
205            LOG.warn(this + ": error requesting short-circuit shared memory " +
206                "access: " + error);
207            return null;
208          }
209        }
210    
211        /**
212         * Allocate a new shared memory slot connected to this datanode.
213         *
214         * Must be called with the EndpointShmManager lock held.
215         *
216         * @param peer          The peer to use to talk to the DataNode.
217         * @param clientName    The client name.
218         * @param usedPeer      (out param) Will be set to true if we used the peer.
219         *                        When a peer is used
220         *
221         * @return              null if the DataNode does not support shared memory
222         *                        segments, or experienced an error creating the
223         *                        shm.  The shared memory segment itself on success.
224         * @throws IOException  If there was an error communicating over the socket.
225         */
226        Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
227            String clientName, ExtendedBlockId blockId) throws IOException {
228          while (true) {
229            if (closed) {
230              if (LOG.isTraceEnabled()) {
231                LOG.trace(this + ": the DfsClientShmManager has been closed.");
232              }
233              return null;
234            }
235            if (disabled) {
236              if (LOG.isTraceEnabled()) {
237                LOG.trace(this + ": shared memory segment access is disabled.");
238              }
239              return null;
240            }
241            // Try to use an existing slot.
242            Slot slot = allocSlotFromExistingShm(blockId);
243            if (slot != null) {
244              return slot;
245            }
246            // There are no free slots.  If someone is loading more slots, wait
247            // for that to finish.
248            if (loading) {
249              if (LOG.isTraceEnabled()) {
250                LOG.trace(this + ": waiting for loading to finish...");
251              }
252              finishedLoading.awaitUninterruptibly();
253            } else {
254              // Otherwise, load the slot ourselves.
255              loading = true;
256              lock.unlock();
257              DfsClientShm shm;
258              try {
259                shm = requestNewShm(clientName, peer);
260                if (shm == null) continue;
261                // See #{DfsClientShmManager#domainSocketWatcher} for details
262                // about why we do this before retaking the manager lock.
263                domainSocketWatcher.add(peer.getDomainSocket(), shm);
264                // The DomainPeer is now our responsibility, and should not be
265                // closed by the caller.
266                usedPeer.setValue(true);
267              } finally {
268                lock.lock();
269                loading = false;
270                finishedLoading.signalAll();
271              }
272              if (shm.isStale()) {
273                // If the peer closed immediately after the shared memory segment
274                // was created, the DomainSocketWatcher callback might already have
275                // fired and marked the shm as stale.  In this case, we obviously
276                // don't want to add the SharedMemorySegment to our list of valid
277                // not-full segments.
278                if (LOG.isDebugEnabled()) {
279                  LOG.debug(this + ": the UNIX domain socket associated with " +
280                      "this short-circuit memory closed before we could make " +
281                      "use of the shm.");
282                }
283              } else {
284                notFull.put(shm.getShmId(), shm);
285              }
286            }
287          }
288        }
289        
290        /**
291         * Stop tracking a slot.
292         *
293         * Must be called with the EndpointShmManager lock held.
294         *
295         * @param slot          The slot to release.
296         */
297        void freeSlot(Slot slot) {
298          DfsClientShm shm = (DfsClientShm)slot.getShm();
299          shm.unregisterSlot(slot.getSlotIdx());
300          if (shm.isStale()) {
301            // Stale shared memory segments should not be tracked here.
302            Preconditions.checkState(!full.containsKey(shm.getShmId()));
303            Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
304            if (shm.isEmpty()) {
305              if (LOG.isTraceEnabled()) {
306                LOG.trace(this + ": freeing empty stale " + shm);
307              }
308              shm.free();
309            }
310          } else {
311            ShmId shmId = shm.getShmId();
312            full.remove(shmId); // The shm can't be full if we just freed a slot.
313            if (shm.isEmpty()) {
314              notFull.remove(shmId);
315      
316              // If the shared memory segment is now empty, we call shutdown(2) on
317              // the UNIX domain socket associated with it.  The DomainSocketWatcher,
318              // which is watching this socket, will call DfsClientShm#handle,
319              // cleaning up this shared memory segment.
320              //
321              // See #{DfsClientShmManager#domainSocketWatcher} for details about why
322              // we don't want to call DomainSocketWatcher#remove directly here.
323              //
324              // Note that we could experience 'fragmentation' here, where the
325              // DFSClient allocates a bunch of slots in different shared memory
326              // segments, and then frees most of them, but never fully empties out
327              // any segment.  We make some attempt to avoid this fragmentation by
328              // always allocating new slots out of the shared memory segment with the
329              // lowest ID, but it could still occur.  In most workloads,
330              // fragmentation should not be a major concern, since it doesn't impact
331              // peak file descriptor usage or the speed of allocation.
332              if (LOG.isTraceEnabled()) {
333                LOG.trace(this + ": shutting down UNIX domain socket for " +
334                    "empty " + shm);
335              }
336              shutdown(shm);
337            } else {
338              notFull.put(shmId, shm);
339            }
340          }
341        }
342        
343        /**
344         * Unregister a shared memory segment.
345         *
346         * Once a segment is unregistered, we will not allocate any more slots
347         * inside that segment.
348         *
349         * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
350         * lock.
351         *
352         * @param shmId         The ID of the shared memory segment to unregister.
353         */
354        void unregisterShm(ShmId shmId) {
355          lock.lock();
356          try {
357            full.remove(shmId);
358            notFull.remove(shmId);
359          } finally {
360            lock.unlock();
361          }
362        }
363    
364        @Override
365        public String toString() {
366          return String.format("EndpointShmManager(%s, parent=%s)",
367              datanode, DfsClientShmManager.this);
368        }
369    
370        PerDatanodeVisitorInfo getVisitorInfo() {
371          return new PerDatanodeVisitorInfo(full, notFull, disabled);
372        }
373    
374        final void shutdown(DfsClientShm shm) {
375          try {
376            shm.getPeer().getDomainSocket().shutdown();
377          } catch (IOException e) {
378            LOG.warn(this + ": error shutting down shm: got IOException calling " +
379                "shutdown(SHUT_RDWR)", e);
380          }
381        }
382      }
383    
384      private boolean closed = false;
385    
386      private final ReentrantLock lock = new ReentrantLock();
387    
388      /**
389       * A condition variable which is signalled when we finish loading a segment
390       * from the Datanode.
391       */
392      private final Condition finishedLoading = lock.newCondition();
393    
394      /**
395       * Information about each Datanode.
396       */
397      private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
398          new HashMap<DatanodeInfo, EndpointShmManager>(1);
399      
400      /**
401       * The DomainSocketWatcher which keeps track of the UNIX domain socket
402       * associated with each shared memory segment.
403       *
404       * Note: because the DomainSocketWatcher makes callbacks into this
405       * DfsClientShmManager object, you must MUST NOT attempt to take the
406       * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
407       * or else deadlock might result.   This means that most DomainSocketWatcher
408       * methods are off-limits unless you release the manager lock first.
409       */
410      private final DomainSocketWatcher domainSocketWatcher;
411      
412      DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
413        this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
414      }
415      
416      public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
417          MutableBoolean usedPeer, ExtendedBlockId blockId,
418          String clientName) throws IOException {
419        lock.lock();
420        try {
421          if (closed) {
422            LOG.trace(this + ": the DfsClientShmManager isclosed.");
423            return null;
424          }
425          EndpointShmManager shmManager = datanodes.get(datanode);
426          if (shmManager == null) {
427            shmManager = new EndpointShmManager(datanode);
428            datanodes.put(datanode, shmManager);
429          }
430          return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
431        } finally {
432          lock.unlock();
433        }
434      }
435      
436      public void freeSlot(Slot slot) {
437        lock.lock();
438        try {
439          DfsClientShm shm = (DfsClientShm)slot.getShm();
440          shm.getEndpointShmManager().freeSlot(slot);
441        } finally {
442          lock.unlock();
443        }
444      }
445    
446      @VisibleForTesting
447      public static class PerDatanodeVisitorInfo {
448        public final TreeMap<ShmId, DfsClientShm> full;
449        public final TreeMap<ShmId, DfsClientShm> notFull;
450        public final boolean disabled;
451    
452        PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
453            TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
454          this.full = full;
455          this.notFull = notFull;
456          this.disabled = disabled;
457        }
458      }
459    
460      @VisibleForTesting
461      public interface Visitor {
462        void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
463            throws IOException;
464      }
465    
466      @VisibleForTesting
467      public void visit(Visitor visitor) throws IOException {
468        lock.lock();
469        try {
470          HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info = 
471              new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
472          for (Entry<DatanodeInfo, EndpointShmManager> entry :
473                datanodes.entrySet()) {
474            info.put(entry.getKey(), entry.getValue().getVisitorInfo());
475          }
476          visitor.visit(info);
477        } finally {
478          lock.unlock();
479        }
480      }
481    
482      /**
483       * Close the DfsClientShmManager.
484       */
485      @Override
486      public void close() throws IOException {
487        lock.lock();
488        try {
489          if (closed) return;
490          closed = true;
491        } finally {
492          lock.unlock();
493        }
494        // When closed, the domainSocketWatcher will issue callbacks that mark
495        // all the outstanding DfsClientShm segments as stale.
496        IOUtils.cleanup(LOG, domainSocketWatcher);
497      }
498    
499    
500      @Override
501      public String toString() {
502        return String.format("ShortCircuitShmManager(%08x)",
503            System.identityHashCode(this));
504      }
505    
506      @VisibleForTesting
507      public DomainSocketWatcher getDomainSocketWatcher() {
508        return domainSocketWatcher;
509      }
510    }