001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 package org.apache.hadoop.hdfs; 019 020 import java.io.FileInputStream; 021 import java.io.IOException; 022 import java.lang.reflect.Field; 023 import java.util.BitSet; 024 import java.util.Iterator; 025 import java.util.NoSuchElementException; 026 import java.util.Random; 027 028 import org.apache.commons.lang.builder.EqualsBuilder; 029 import org.apache.commons.lang.builder.HashCodeBuilder; 030 import org.apache.commons.logging.Log; 031 import org.apache.commons.logging.LogFactory; 032 import org.apache.hadoop.fs.InvalidRequestException; 033 import org.apache.hadoop.io.nativeio.NativeIO; 034 import org.apache.hadoop.io.nativeio.NativeIO.POSIX; 035 import org.apache.hadoop.util.Shell; 036 import org.apache.hadoop.util.StringUtils; 037 038 import com.google.common.base.Preconditions; 039 import com.google.common.collect.ComparisonChain; 040 import com.google.common.primitives.Ints; 041 042 import sun.misc.Unsafe; 043 044 /** 045 * A shared memory segment used to implement short-circuit reads. 046 */ 047 public class ShortCircuitShm { 048 private static final Log LOG = LogFactory.getLog(ShortCircuitShm.class); 049 050 protected static final int BYTES_PER_SLOT = 64; 051 052 private static final Unsafe unsafe = safetyDance(); 053 054 private static Unsafe safetyDance() { 055 try { 056 Field f = Unsafe.class.getDeclaredField("theUnsafe"); 057 f.setAccessible(true); 058 return (Unsafe)f.get(null); 059 } catch (Throwable e) { 060 LOG.error("failed to load misc.Unsafe", e); 061 } 062 return null; 063 } 064 065 /** 066 * Calculate the usable size of a shared memory segment. 067 * We round down to a multiple of the slot size and do some validation. 068 * 069 * @param stream The stream we're using. 070 * @return The usable size of the shared memory segment. 071 */ 072 private static int getUsableLength(FileInputStream stream) 073 throws IOException { 074 int intSize = Ints.checkedCast(stream.getChannel().size()); 075 int slots = intSize / BYTES_PER_SLOT; 076 if (slots == 0) { 077 throw new IOException("size of shared memory segment was " + 078 intSize + ", but that is not enough to hold even one slot."); 079 } 080 return slots * BYTES_PER_SLOT; 081 } 082 083 /** 084 * Identifies a DfsClientShm. 085 */ 086 public static class ShmId implements Comparable<ShmId> { 087 private static final Random random = new Random(); 088 private final long hi; 089 private final long lo; 090 091 /** 092 * Generate a random ShmId. 093 * 094 * We generate ShmIds randomly to prevent a malicious client from 095 * successfully guessing one and using that to interfere with another 096 * client. 097 */ 098 public static ShmId createRandom() { 099 return new ShmId(random.nextLong(), random.nextLong()); 100 } 101 102 public ShmId(long hi, long lo) { 103 this.hi = hi; 104 this.lo = lo; 105 } 106 107 public long getHi() { 108 return hi; 109 } 110 111 public long getLo() { 112 return lo; 113 } 114 115 @Override 116 public boolean equals(Object o) { 117 if ((o == null) || (o.getClass() != this.getClass())) { 118 return false; 119 } 120 ShmId other = (ShmId)o; 121 return new EqualsBuilder(). 122 append(hi, other.hi). 123 append(lo, other.lo). 124 isEquals(); 125 } 126 127 @Override 128 public int hashCode() { 129 return new HashCodeBuilder(). 130 append(this.hi). 131 append(this.lo). 132 toHashCode(); 133 } 134 135 @Override 136 public String toString() { 137 return String.format("%016x%016x", hi, lo); 138 } 139 140 @Override 141 public int compareTo(ShmId other) { 142 return ComparisonChain.start(). 143 compare(hi, other.hi). 144 compare(lo, other.lo). 145 result(); 146 } 147 }; 148 149 /** 150 * Uniquely identifies a slot. 151 */ 152 public static class SlotId { 153 private final ShmId shmId; 154 private final int slotIdx; 155 156 public SlotId(ShmId shmId, int slotIdx) { 157 this.shmId = shmId; 158 this.slotIdx = slotIdx; 159 } 160 161 public ShmId getShmId() { 162 return shmId; 163 } 164 165 public int getSlotIdx() { 166 return slotIdx; 167 } 168 169 @Override 170 public boolean equals(Object o) { 171 if ((o == null) || (o.getClass() != this.getClass())) { 172 return false; 173 } 174 SlotId other = (SlotId)o; 175 return new EqualsBuilder(). 176 append(shmId, other.shmId). 177 append(slotIdx, other.slotIdx). 178 isEquals(); 179 } 180 181 @Override 182 public int hashCode() { 183 return new HashCodeBuilder(). 184 append(this.shmId). 185 append(this.slotIdx). 186 toHashCode(); 187 } 188 189 @Override 190 public String toString() { 191 return String.format("SlotId(%s:%d)", shmId.toString(), slotIdx); 192 } 193 } 194 195 public class SlotIterator implements Iterator<Slot> { 196 int slotIdx = -1; 197 198 @Override 199 public boolean hasNext() { 200 synchronized (ShortCircuitShm.this) { 201 return allocatedSlots.nextSetBit(slotIdx + 1) != -1; 202 } 203 } 204 205 @Override 206 public Slot next() { 207 synchronized (ShortCircuitShm.this) { 208 int nextSlotIdx = allocatedSlots.nextSetBit(slotIdx + 1); 209 if (nextSlotIdx == -1) { 210 throw new NoSuchElementException(); 211 } 212 slotIdx = nextSlotIdx; 213 return slots[nextSlotIdx]; 214 } 215 } 216 217 @Override 218 public void remove() { 219 throw new UnsupportedOperationException("SlotIterator " + 220 "doesn't support removal"); 221 } 222 } 223 224 /** 225 * A slot containing information about a replica. 226 * 227 * The format is: 228 * word 0 229 * bit 0:32 Slot flags (see below). 230 * bit 33:63 Anchor count. 231 * word 1:7 232 * Reserved for future use, such as statistics. 233 * Padding is also useful for avoiding false sharing. 234 * 235 * Little-endian versus big-endian is not relevant here since both the client 236 * and the server reside on the same computer and use the same orientation. 237 */ 238 public class Slot { 239 /** 240 * Flag indicating that the slot is valid. 241 * 242 * The DFSClient sets this flag when it allocates a new slot within one of 243 * its shared memory regions. 244 * 245 * The DataNode clears this flag when the replica associated with this slot 246 * is no longer valid. The client itself also clears this flag when it 247 * believes that the DataNode is no longer using this slot to communicate. 248 */ 249 private static final long VALID_FLAG = 1L<<63; 250 251 /** 252 * Flag indicating that the slot can be anchored. 253 */ 254 private static final long ANCHORABLE_FLAG = 1L<<62; 255 256 /** 257 * The slot address in memory. 258 */ 259 private final long slotAddress; 260 261 /** 262 * BlockId of the block this slot is used for. 263 */ 264 private final ExtendedBlockId blockId; 265 266 Slot(long slotAddress, ExtendedBlockId blockId) { 267 this.slotAddress = slotAddress; 268 this.blockId = blockId; 269 } 270 271 /** 272 * Get the short-circuit memory segment associated with this Slot. 273 * 274 * @return The enclosing short-circuit memory segment. 275 */ 276 public ShortCircuitShm getShm() { 277 return ShortCircuitShm.this; 278 } 279 280 /** 281 * Get the ExtendedBlockId associated with this slot. 282 * 283 * @return The ExtendedBlockId of this slot. 284 */ 285 public ExtendedBlockId getBlockId() { 286 return blockId; 287 } 288 289 /** 290 * Get the SlotId of this slot, containing both shmId and slotIdx. 291 * 292 * @return The SlotId of this slot. 293 */ 294 public SlotId getSlotId() { 295 return new SlotId(getShmId(), getSlotIdx()); 296 } 297 298 /** 299 * Get the Slot index. 300 * 301 * @return The index of this slot. 302 */ 303 public int getSlotIdx() { 304 return Ints.checkedCast( 305 (slotAddress - baseAddress) / BYTES_PER_SLOT); 306 } 307 308 private boolean isSet(long flag) { 309 long prev = unsafe.getLongVolatile(null, this.slotAddress); 310 return (prev & flag) != 0; 311 } 312 313 private void setFlag(long flag) { 314 long prev; 315 do { 316 prev = unsafe.getLongVolatile(null, this.slotAddress); 317 if ((prev & flag) != 0) { 318 return; 319 } 320 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 321 prev, prev | flag)); 322 } 323 324 private void clearFlag(long flag) { 325 long prev; 326 do { 327 prev = unsafe.getLongVolatile(null, this.slotAddress); 328 if ((prev & flag) == 0) { 329 return; 330 } 331 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 332 prev, prev & (~flag))); 333 } 334 335 public boolean isValid() { 336 return isSet(VALID_FLAG); 337 } 338 339 public void makeValid() { 340 setFlag(VALID_FLAG); 341 } 342 343 public void makeInvalid() { 344 clearFlag(VALID_FLAG); 345 } 346 347 public boolean isAnchorable() { 348 return isSet(ANCHORABLE_FLAG); 349 } 350 351 public void makeAnchorable() { 352 setFlag(ANCHORABLE_FLAG); 353 } 354 355 public void makeUnanchorable() { 356 clearFlag(ANCHORABLE_FLAG); 357 } 358 359 public boolean isAnchored() { 360 long prev = unsafe.getLongVolatile(null, this.slotAddress); 361 if ((prev & VALID_FLAG) == 0) { 362 // Slot is no longer valid. 363 return false; 364 } 365 return ((prev & 0x7fffffff) != 0); 366 } 367 368 /** 369 * Try to add an anchor for a given slot. 370 * 371 * When a slot is anchored, we know that the block it refers to is resident 372 * in memory. 373 * 374 * @return True if the slot is anchored. 375 */ 376 public boolean addAnchor() { 377 long prev; 378 do { 379 prev = unsafe.getLongVolatile(null, this.slotAddress); 380 if ((prev & VALID_FLAG) == 0) { 381 // Slot is no longer valid. 382 return false; 383 } 384 if ((prev & ANCHORABLE_FLAG) == 0) { 385 // Slot can't be anchored right now. 386 return false; 387 } 388 if ((prev & 0x7fffffff) == 0x7fffffff) { 389 // Too many other threads have anchored the slot (2 billion?) 390 return false; 391 } 392 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 393 prev, prev + 1)); 394 return true; 395 } 396 397 /** 398 * Remove an anchor for a given slot. 399 */ 400 public void removeAnchor() { 401 long prev; 402 do { 403 prev = unsafe.getLongVolatile(null, this.slotAddress); 404 Preconditions.checkState((prev & 0x7fffffff) != 0, 405 "Tried to remove anchor for slot " + slotAddress +", which was " + 406 "not anchored."); 407 } while (!unsafe.compareAndSwapLong(null, this.slotAddress, 408 prev, prev - 1)); 409 } 410 411 @Override 412 public String toString() { 413 return "Slot(slotIdx=" + getSlotIdx() + ", shm=" + getShm() + ")"; 414 } 415 } 416 417 /** 418 * ID for this SharedMemorySegment. 419 */ 420 private final ShmId shmId; 421 422 /** 423 * The base address of the memory-mapped file. 424 */ 425 private final long baseAddress; 426 427 /** 428 * The mmapped length of the shared memory segment 429 */ 430 private final int mmappedLength; 431 432 /** 433 * The slots associated with this shared memory segment. 434 * slot[i] contains the slot at offset i * BYTES_PER_SLOT, 435 * or null if that slot is not allocated. 436 */ 437 private final Slot slots[]; 438 439 /** 440 * A bitset where each bit represents a slot which is in use. 441 */ 442 private final BitSet allocatedSlots; 443 444 /** 445 * Create the ShortCircuitShm. 446 * 447 * @param shmId The ID to use. 448 * @param stream The stream that we're going to use to create this 449 * shared memory segment. 450 * 451 * Although this is a FileInputStream, we are going to 452 * assume that the underlying file descriptor is writable 453 * as well as readable. It would be more appropriate to use 454 * a RandomAccessFile here, but that class does not have 455 * any public accessor which returns a FileDescriptor, 456 * unlike FileInputStream. 457 */ 458 public ShortCircuitShm(ShmId shmId, FileInputStream stream) 459 throws IOException { 460 if (!NativeIO.isAvailable()) { 461 throw new UnsupportedOperationException("NativeIO is not available."); 462 } 463 if (Shell.WINDOWS) { 464 throw new UnsupportedOperationException( 465 "DfsClientShm is not yet implemented for Windows."); 466 } 467 if (unsafe == null) { 468 throw new UnsupportedOperationException( 469 "can't use DfsClientShm because we failed to " + 470 "load misc.Unsafe."); 471 } 472 this.shmId = shmId; 473 this.mmappedLength = getUsableLength(stream); 474 this.baseAddress = POSIX.mmap(stream.getFD(), 475 POSIX.MMAP_PROT_READ | POSIX.MMAP_PROT_WRITE, true, mmappedLength); 476 this.slots = new Slot[mmappedLength / BYTES_PER_SLOT]; 477 this.allocatedSlots = new BitSet(slots.length); 478 if (LOG.isTraceEnabled()) { 479 LOG.trace("creating " + this.getClass().getSimpleName() + 480 "(shmId=" + shmId + 481 ", mmappedLength=" + mmappedLength + 482 ", baseAddress=" + String.format("%x", baseAddress) + 483 ", slots.length=" + slots.length + ")"); 484 } 485 } 486 487 public final ShmId getShmId() { 488 return shmId; 489 } 490 491 /** 492 * Determine if this shared memory object is empty. 493 * 494 * @return True if the shared memory object is empty. 495 */ 496 synchronized final public boolean isEmpty() { 497 return allocatedSlots.nextSetBit(0) == -1; 498 } 499 500 /** 501 * Determine if this shared memory object is full. 502 * 503 * @return True if the shared memory object is full. 504 */ 505 synchronized final public boolean isFull() { 506 return allocatedSlots.nextClearBit(0) >= slots.length; 507 } 508 509 /** 510 * Calculate the base address of a slot. 511 * 512 * @param slotIdx Index of the slot. 513 * @return The base address of the slot. 514 */ 515 private final long calculateSlotAddress(int slotIdx) { 516 long offset = slotIdx; 517 offset *= BYTES_PER_SLOT; 518 return this.baseAddress + offset; 519 } 520 521 /** 522 * Allocate a new slot and register it. 523 * 524 * This function chooses an empty slot, initializes it, and then returns 525 * the relevant Slot object. 526 * 527 * @return The new slot. 528 */ 529 synchronized public final Slot allocAndRegisterSlot( 530 ExtendedBlockId blockId) { 531 int idx = allocatedSlots.nextClearBit(0); 532 if (idx >= slots.length) { 533 throw new RuntimeException(this + ": no more slots are available."); 534 } 535 allocatedSlots.set(idx, true); 536 Slot slot = new Slot(calculateSlotAddress(idx), blockId); 537 slot.makeValid(); 538 slots[idx] = slot; 539 if (LOG.isTraceEnabled()) { 540 LOG.trace(this + ": allocAndRegisterSlot " + idx + ": allocatedSlots=" + allocatedSlots + 541 StringUtils.getStackTrace(Thread.currentThread())); 542 } 543 return slot; 544 } 545 546 synchronized public final Slot getSlot(int slotIdx) 547 throws InvalidRequestException { 548 if (!allocatedSlots.get(slotIdx)) { 549 throw new InvalidRequestException(this + ": slot " + slotIdx + 550 " does not exist."); 551 } 552 return slots[slotIdx]; 553 } 554 555 /** 556 * Register a slot. 557 * 558 * This function looks at a slot which has already been initialized (by 559 * another process), and registers it with us. Then, it returns the 560 * relevant Slot object. 561 * 562 * @return The slot. 563 * 564 * @throws InvalidRequestException 565 * If the slot index we're trying to allocate has not been 566 * initialized, or is already in use. 567 */ 568 synchronized public final Slot registerSlot(int slotIdx, 569 ExtendedBlockId blockId) throws InvalidRequestException { 570 if (slotIdx < 0) { 571 throw new InvalidRequestException(this + ": invalid negative slot " + 572 "index " + slotIdx); 573 } 574 if (slotIdx >= slots.length) { 575 throw new InvalidRequestException(this + ": invalid slot " + 576 "index " + slotIdx); 577 } 578 if (allocatedSlots.get(slotIdx)) { 579 throw new InvalidRequestException(this + ": slot " + slotIdx + 580 " is already in use."); 581 } 582 Slot slot = new Slot(calculateSlotAddress(slotIdx), blockId); 583 if (!slot.isValid()) { 584 throw new InvalidRequestException(this + ": slot " + slotIdx + 585 " has not been allocated."); 586 } 587 slots[slotIdx] = slot; 588 allocatedSlots.set(slotIdx, true); 589 if (LOG.isTraceEnabled()) { 590 LOG.trace(this + ": registerSlot " + slotIdx + ": allocatedSlots=" + allocatedSlots + 591 StringUtils.getStackTrace(Thread.currentThread())); 592 } 593 return slot; 594 } 595 596 /** 597 * Unregisters a slot. 598 * 599 * This doesn't alter the contents of the slot. It just means 600 * 601 * @param slotIdx Index of the slot to unregister. 602 */ 603 synchronized public final void unregisterSlot(int slotIdx) { 604 Preconditions.checkState(allocatedSlots.get(slotIdx), 605 "tried to unregister slot " + slotIdx + ", which was not registered."); 606 allocatedSlots.set(slotIdx, false); 607 slots[slotIdx] = null; 608 if (LOG.isTraceEnabled()) { 609 LOG.trace(this + ": unregisterSlot " + slotIdx); 610 } 611 } 612 613 /** 614 * Iterate over all allocated slots. 615 * 616 * Note that this method isn't safe if 617 * 618 * @return The slot iterator. 619 */ 620 public SlotIterator slotIterator() { 621 return new SlotIterator(); 622 } 623 624 public void free() { 625 try { 626 POSIX.munmap(baseAddress, mmappedLength); 627 } catch (IOException e) { 628 LOG.warn(this + ": failed to munmap", e); 629 } 630 LOG.trace(this + ": freed"); 631 } 632 633 @Override 634 public String toString() { 635 return this.getClass().getSimpleName() + "(" + shmId + ")"; 636 } 637 }