001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs;
019    
020    import java.io.FileInputStream;
021    import java.io.IOException;
022    import java.lang.reflect.Field;
023    import java.util.BitSet;
024    import java.util.Iterator;
025    import java.util.NoSuchElementException;
026    import java.util.Random;
027    
028    import org.apache.commons.lang.builder.EqualsBuilder;
029    import org.apache.commons.lang.builder.HashCodeBuilder;
030    import org.apache.commons.logging.Log;
031    import org.apache.commons.logging.LogFactory;
032    import org.apache.hadoop.fs.InvalidRequestException;
033    import org.apache.hadoop.io.nativeio.NativeIO;
034    import org.apache.hadoop.io.nativeio.NativeIO.POSIX;
035    import org.apache.hadoop.util.Shell;
036    import org.apache.hadoop.util.StringUtils;
037    
038    import com.google.common.base.Preconditions;
039    import com.google.common.collect.ComparisonChain;
040    import com.google.common.primitives.Ints;
041    
042    import sun.misc.Unsafe;
043    
044    /**
045     * A shared memory segment used to implement short-circuit reads.
046     */
047    public class ShortCircuitShm {
048      private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class);
049    
050      protected static final int BYTES_PER_SLOT = 64;
051    
052      private static final Unsafe unsafe = safetyDance();
053    
054      private static Unsafe safetyDance() {
055        try {
056          Field f = Unsafe.class.getDeclaredField("theUnsafe");
057          f.setAccessible(true);
058          return (Unsafe)f.get(null);
059        } catch (Throwable e) {
060          LOG.error("failed to load misc.Unsafe", e);
061        }
062        return null;
063      }
064    
065      /**
066       * Calculate the usable size of a shared memory segment.
067       * We round down to a multiple of the slot size and do some validation.
068       *
069       * @param stream The stream we're using.
070       * @return       The usable size of the shared memory segment.
071       */
072      private static int getUsableLength(FileInputStream stream)
073          throws IOException {
074        int intSize = Ints.checkedCast(stream.getChannel().size());
075        int slots = intSize / BYTES_PER_SLOT;
076        if (slots == 0) {
077          throw new IOException("size of shared memory segment was " +
078              intSize + ", but that is not enough to hold even one slot.");
079        }
080        return slots * BYTES_PER_SLOT;
081      }
082    
083      /**
084       * Identifies a DfsClientShm.
085       */
086      public static class ShmId implements Comparable<ShmId> {
087        private static final Random random = new Random();
088        private final long hi;
089        private final long lo;
090    
091        /**
092         * Generate a random ShmId.
093         * 
094         * We generate ShmIds randomly to prevent a malicious client from
095         * successfully guessing one and using that to interfere with another
096         * client.
097         */
098        public static ShmId createRandom() {
099          return new ShmId(random.nextLong(), random.nextLong());
100        }
101    
102        public ShmId(long hi, long lo) {
103          this.hi = hi;
104          this.lo = lo;
105        }
106        
107        public long getHi() {
108          return hi;
109        }
110        
111        public long getLo() {
112          return lo;
113        }
114    
115        @Override
116        public boolean equals(Object o) {
117          if ((o == null) || (o.getClass() != this.getClass())) {
118            return false;
119          }
120          ShmId other = (ShmId)o;
121          return new EqualsBuilder().
122              append(hi, other.hi).
123              append(lo, other.lo).
124              isEquals();
125        }
126    
127        @Override
128        public int hashCode() {
129          return new HashCodeBuilder().
130              append(this.hi).
131              append(this.lo).
132              toHashCode();
133        }
134    
135        @Override
136        public String toString() {
137          return String.format("%016x%016x", hi, lo);
138        }
139    
140        @Override
141        public int compareTo(ShmId other) {
142          return ComparisonChain.start().
143              compare(hi, other.hi).
144              compare(lo, other.lo).
145              result();
146        }
147      };
148    
149      /**
150       * Uniquely identifies a slot.
151       */
152      public static class SlotId {
153        private final ShmId shmId;
154        private final int slotIdx;
155        
156        public SlotId(ShmId shmId, int slotIdx) {
157          this.shmId = shmId;
158          this.slotIdx = slotIdx;
159        }
160    
161        public ShmId getShmId() {
162          return shmId;
163        }
164    
165        public int getSlotIdx() {
166          return slotIdx;
167        }
168    
169        @Override
170        public boolean equals(Object o) {
171          if ((o == null) || (o.getClass() != this.getClass())) {
172            return false;
173          }
174          SlotId other = (SlotId)o;
175          return new EqualsBuilder().
176              append(shmId, other.shmId).
177              append(slotIdx, other.slotIdx).
178              isEquals();
179        }
180    
181        @Override
182        public int hashCode() {
183          return new HashCodeBuilder().
184              append(this.shmId).
185              append(this.slotIdx).
186              toHashCode();
187        }
188    
189        @Override
190        public String toString() {
191          return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx);
192        }
193      }
194    
195      public class SlotIterator implements Iterator<Slot> {
196        int slotIdx = -1;
197    
198        @Override
199        public boolean hasNext() {
200          synchronized (ShortCircuitShm.this) {
201            return allocatedSlots.nextSetBit(slotIdx + 1) != -1;
202          }
203        }
204    
205        @Override
206        public Slot next() {
207          synchronized (ShortCircuitShm.this) {
208            int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1);
209            if (nextSlotIdx == -1) {
210              throw new NoSuchElementException();
211            }
212            slotIdx = nextSlotIdx;
213            return slots[nextSlotIdx];
214          }
215        }
216    
217        @Override
218        public void remove() {
219          throw new UnsupportedOperationException("SlotIterator " +
220              "doesn't support removal");
221        }
222      }
223      
224      /**
225       * A slot containing information about a replica.
226       *
227       * The format is:
228       * word 0
229       *   bit 0:32   Slot flags (see below).
230       *   bit 33:63  Anchor count.
231       * word 1:7
232       *   Reserved for future use, such as statistics.
233       *   Padding is also useful for avoiding false sharing.
234       *
235       * Little-endian versus big-endian is not relevant here since both the client
236       * and the server reside on the same computer and use the same orientation.
237       */
238      public class Slot {
239        /**
240         * Flag indicating that the slot is valid.  
241         * 
242         * The DFSClient sets this flag when it allocates a new slot within one of
243         * its shared memory regions.
244         * 
245         * The DataNode clears this flag when the replica associated with this slot
246         * is no longer valid.  The client itself also clears this flag when it
247         * believes that the DataNode is no longer using this slot to communicate.
248         */
249        private static final long VALID_FLAG =          1L<<63;
250    
251        /**
252         * Flag indicating that the slot can be anchored.
253         */
254        private static final long ANCHORABLE_FLAG =     1L<<62;
255    
256        /**
257         * The slot address in memory.
258         */
259        private final long slotAddress;
260    
261        /**
262         * BlockId of the block this slot is used for.
263         */
264        private final ExtendedBlockId blockId;
265    
266        Slot(long slotAddress, ExtendedBlockId blockId) {
267          this.slotAddress = slotAddress;
268          this.blockId = blockId;
269        }
270    
271        /**
272         * Get the short-circuit memory segment associated with this Slot.
273         *
274         * @return      The enclosing short-circuit memory segment.
275         */
276        public ShortCircuitShm getShm() {
277          return ShortCircuitShm.this;
278        }
279    
280        /**
281         * Get the ExtendedBlockId associated with this slot.
282         *
283         * @return      The ExtendedBlockId of this slot.
284         */
285        public ExtendedBlockId getBlockId() {
286          return blockId;
287        }
288    
289        /**
290         * Get the SlotId of this slot, containing both shmId and slotIdx.
291         *
292         * @return      The SlotId of this slot.
293         */
294        public SlotId getSlotId() {
295          return new SlotId(getShmId(), getSlotIdx());
296        }
297    
298        /**
299         * Get the Slot index.
300         *
301         * @return      The index of this slot.
302         */
303        public int getSlotIdx() {
304          return Ints.checkedCast(
305              (slotAddress - baseAddress) / BYTES_PER_SLOT);
306        }
307    
308        private boolean isSet(long flag) {
309          long prev = unsafe.getLongVolatile(null, this.slotAddress);
310          return (prev & flag) != 0;
311        }
312    
313        private void setFlag(long flag) {
314          long prev;
315          do {
316            prev = unsafe.getLongVolatile(null, this.slotAddress);
317            if ((prev & flag) != 0) {
318              return;
319            }
320          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
321                      prev, prev | flag));
322        }
323    
324        private void clearFlag(long flag) {
325          long prev;
326          do {
327            prev = unsafe.getLongVolatile(null, this.slotAddress);
328            if ((prev & flag) == 0) {
329              return;
330            }
331          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
332                      prev, prev & (~flag)));
333        }
334        
335        public boolean isValid() {
336          return isSet(VALID_FLAG);
337        }
338    
339        public void makeValid() {
340          setFlag(VALID_FLAG);
341        }
342    
343        public void makeInvalid() {
344          clearFlag(VALID_FLAG);
345        }
346    
347        public boolean isAnchorable() {
348          return isSet(ANCHORABLE_FLAG);
349        }
350    
351        public void makeAnchorable() {
352          setFlag(ANCHORABLE_FLAG);
353        }
354    
355        public void makeUnanchorable() {
356          clearFlag(ANCHORABLE_FLAG);
357        }
358    
359        public boolean isAnchored() {
360          long prev = unsafe.getLongVolatile(null, this.slotAddress);
361          if ((prev & VALID_FLAG) == 0) {
362            // Slot is no longer valid.
363            return false;
364          }
365          return ((prev & 0x7fffffff) != 0);
366        }
367    
368        /**
369         * Try to add an anchor for a given slot.
370         *
371         * When a slot is anchored, we know that the block it refers to is resident
372         * in memory.
373         *
374         * @return          True if the slot is anchored.
375         */
376        public boolean addAnchor() {
377          long prev;
378          do {
379            prev = unsafe.getLongVolatile(null, this.slotAddress);
380            if ((prev & VALID_FLAG) == 0) {
381              // Slot is no longer valid.
382              return false;
383            }
384            if ((prev & ANCHORABLE_FLAG) == 0) {
385              // Slot can't be anchored right now.
386              return false;
387            }
388            if ((prev & 0x7fffffff) == 0x7fffffff) {
389              // Too many other threads have anchored the slot (2 billion?)
390              return false;
391            }
392          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
393                      prev, prev + 1));
394          return true;
395        }
396    
397        /**
398         * Remove an anchor for a given slot.
399         */
400        public void removeAnchor() {
401          long prev;
402          do {
403            prev = unsafe.getLongVolatile(null, this.slotAddress);
404            Preconditions.checkState((prev & 0x7fffffff) != 0,
405                "Tried to remove anchor for slot " + slotAddress +", which was " +
406                "not anchored.");
407          } while (!unsafe.compareAndSwapLong(null, this.slotAddress,
408                      prev, prev - 1));
409        }
410    
411        @Override
412        public String toString() {
413          return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")";
414        }
415      }
416    
417      /**
418       * ID for this SharedMemorySegment.
419       */
420      private final ShmId shmId;
421    
422      /**
423       * The base address of the memory-mapped file.
424       */
425      private final long baseAddress;
426    
427      /**
428       * The mmapped length of the shared memory segment
429       */
430      private final int mmappedLength;
431    
432      /**
433       * The slots associated with this shared memory segment.
434       * slot[i] contains the slot at offset i * BYTES_PER_SLOT,
435       * or null if that slot is not allocated.
436       */
437      private final Slot slots[];
438    
439      /**
440       * A bitset where each bit represents a slot which is in use.
441       */
442      private final BitSet allocatedSlots;
443    
444      /**
445       * Create the ShortCircuitShm.
446       * 
447       * @param shmId       The ID to use.
448       * @param stream      The stream that we're going to use to create this 
449       *                    shared memory segment.
450       *                    
451       *                    Although this is a FileInputStream, we are going to
452       *                    assume that the underlying file descriptor is writable
453       *                    as well as readable. It would be more appropriate to use
454       *                    a RandomAccessFile here, but that class does not have
455       *                    any public accessor which returns a FileDescriptor,
456       *                    unlike FileInputStream.
457       */
458      public ShortCircuitShm(ShmId shmId, FileInputStream stream)
459            throws IOException {
460        if (!NativeIO.isAvailable()) {
461          throw new UnsupportedOperationException("NativeIO is not available.");
462        }
463        if (Shell.WINDOWS) {
464          throw new UnsupportedOperationException(
465              "DfsClientShm is not yet implemented for Windows.");
466        }
467        if (unsafe == null) {
468          throw new UnsupportedOperationException(
469              "can't use DfsClientShm because we failed to " +
470              "load misc.Unsafe.");
471        }
472        this.shmId = shmId;
473        this.mmappedLength = getUsableLength(stream);
474        this.baseAddress = POSIX.mmap(stream.getFD(), 
475            POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength);
476        this.slots = new Slot[mmappedLength / BYTES_PER_SLOT];
477        this.allocatedSlots = new BitSet(slots.length);
478        if (LOG.isTraceEnabled()) {
479          LOG.trace("creating " + this.getClass().getSimpleName() +
480              "(shmId=" + shmId +
481              ", mmappedLength=" + mmappedLength +
482              ", baseAddress=" + String.format("%x", baseAddress) +
483              ", slots.length=" + slots.length + ")");
484        }
485      }
486    
487      public final ShmId getShmId() {
488        return shmId;
489      }
490      
491      /**
492       * Determine if this shared memory object is empty.
493       *
494       * @return    True if the shared memory object is empty.
495       */
496      synchronized final public boolean isEmpty() {
497        return allocatedSlots.nextSetBit(0) == -1;
498      }
499    
500      /**
501       * Determine if this shared memory object is full.
502       *
503       * @return    True if the shared memory object is full.
504       */
505      synchronized final public boolean isFull() {
506        return allocatedSlots.nextClearBit(0) >= slots.length;
507      }
508    
509      /**
510       * Calculate the base address of a slot.
511       *
512       * @param slotIdx   Index of the slot.
513       * @return          The base address of the slot.
514       */
515      private final long calculateSlotAddress(int slotIdx) {
516        long offset = slotIdx;
517        offset *= BYTES_PER_SLOT;
518        return this.baseAddress + offset;
519      }
520    
521      /**
522       * Allocate a new slot and register it.
523       *
524       * This function chooses an empty slot, initializes it, and then returns
525       * the relevant Slot object.
526       *
527       * @return    The new slot.
528       */
529      synchronized public final Slot allocAndRegisterSlot(
530          ExtendedBlockId blockId) {
531        int idx = allocatedSlots.nextClearBit(0);
532        if (idx >= slots.length) {
533          throw new RuntimeException(this + ": no more slots are available.");
534        }
535        allocatedSlots.set(idx, true);
536        Slot slot = new Slot(calculateSlotAddress(idx), blockId);
537        slot.makeValid();
538        slots[idx] = slot;
539        if (LOG.isTraceEnabled()) {
540          LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots +
541                      StringUtils.getStackTrace(Thread.currentThread()));
542        }
543        return slot;
544      }
545    
546      synchronized public final Slot getSlot(int slotIdx)
547          throws InvalidRequestException {
548        if (!allocatedSlots.get(slotIdx)) {
549          throw new InvalidRequestException(this + ": slot " + slotIdx +
550              " does not exist.");
551        }
552        return slots[slotIdx];
553      }
554    
555      /**
556       * Register a slot.
557       *
558       * This function looks at a slot which has already been initialized (by
559       * another process), and registers it with us.  Then, it returns the 
560       * relevant Slot object.
561       *
562       * @return    The slot.
563       *
564       * @throws InvalidRequestException
565       *            If the slot index we're trying to allocate has not been
566       *            initialized, or is already in use.
567       */
568      synchronized public final Slot registerSlot(int slotIdx,
569          ExtendedBlockId blockId) throws InvalidRequestException {
570        if (slotIdx < 0) {
571          throw new InvalidRequestException(this + ": invalid negative slot " +
572              "index " + slotIdx);
573        }
574        if (slotIdx >= slots.length) {
575          throw new InvalidRequestException(this + ": invalid slot " +
576              "index " + slotIdx);
577        }
578        if (allocatedSlots.get(slotIdx)) {
579          throw new InvalidRequestException(this + ": slot " + slotIdx +
580              " is already in use.");
581        }
582        Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId);
583        if (!slot.isValid()) {
584          throw new InvalidRequestException(this + ": slot " + slotIdx +
585              " has not been allocated.");
586        }
587        slots[slotIdx] = slot;
588        allocatedSlots.set(slotIdx, true);
589        if (LOG.isTraceEnabled()) {
590          LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots +
591                      StringUtils.getStackTrace(Thread.currentThread()));
592        }
593        return slot;
594      }
595    
596      /**
597       * Unregisters a slot.
598       * 
599       * This doesn't alter the contents of the slot.  It just means
600       *
601       * @param slotIdx  Index of the slot to unregister.
602       */
603      synchronized public final void unregisterSlot(int slotIdx) {
604        Preconditions.checkState(allocatedSlots.get(slotIdx),
605            "tried to unregister slot " + slotIdx + ", which was not registered.");
606        allocatedSlots.set(slotIdx, false);
607        slots[slotIdx] = null;
608        if (LOG.isTraceEnabled()) {
609          LOG.trace(this + ": unregisterSlot " + slotIdx);
610        }
611      }
612      
613      /**
614       * Iterate over all allocated slots.
615       * 
616       * Note that this method isn't safe if 
617       *
618       * @return        The slot iterator.
619       */
620      public SlotIterator slotIterator() {
621        return new SlotIterator();
622      }
623    
624      public void free() {
625        try {
626          POSIX.munmap(baseAddress, mmappedLength);
627        } catch (IOException e) {
628          LOG.warn(this + ": failed to munmap", e);
629        }
630        LOG.trace(this + ": freed");
631      }
632      
633      @Override
634      public String toString() {
635        return this.getClass().getSimpleName() + "(" + shmId + ")";
636      }
637    }