/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.client;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.ShortCircuitShm.ShmId;
import org.apache.hadoop.hdfs.ShortCircuitShm.Slot;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.ShortCircuitShmResponseProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.DomainSocketWatcher;
import org.apache.hadoop.classification.InterfaceAudience;

/**
 * Manages short-circuit memory segments for an HDFS client.
 *
 * Clients are responsible for requesting and releasing shared memory segments
 * used for communicating with the DataNode.  The client will try to allocate
 * new slots in the set of existing segments, falling back to getting a new
 * segment from the DataNode via {@link Sender#requestShortCircuitShm}.
 *
 * The counterpart to this class on the DataNode is
 * {@link ShortCircuitRegistry}.  See {@link ShortCircuitRegistry} for more
 * information on the communication protocol.
 *
 * All mutable state in this class and in its {@link EndpointShmManager}
 * instances is guarded by a single manager-wide {@link ReentrantLock}.
 */
@InterfaceAudience.Private
public class DfsClientShmManager implements Closeable {
  private static final Log LOG = LogFactory.getLog(DfsClientShmManager.class);

  /**
   * Manages short-circuit memory segments that pertain to a given DataNode.
   */
  class EndpointShmManager {
    /**
     * The datanode we're managing.
     */
    private final DatanodeInfo datanode;

    /**
     * Shared memory segments which have no empty slots.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> full =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * Shared memory segments which have at least one empty slot.
     *
     * Protected by the manager lock.
     */
    private final TreeMap<ShmId, DfsClientShm> notFull =
        new TreeMap<ShmId, DfsClientShm>();

    /**
     * True if this datanode doesn't support short-circuit shared memory
     * segments.
     *
     * Protected by the manager lock.
     */
    private boolean disabled = false;

    /**
     * True if we're in the process of loading a shared memory segment from
     * this DataNode.
     *
     * Protected by the manager lock.
     */
    private boolean loading = false;

    EndpointShmManager (DatanodeInfo datanode) {
      this.datanode = datanode;
    }

    /**
     * Pull a slot out of a preexisting shared memory segment.
     *
     * Must be called with the manager lock held.
     *
     * @param blockId     The blockId to put inside the Slot object.
     *
     * @return            null if none of our shared memory segments contain a
     *                      free slot; the slot object otherwise.
     */
    private Slot allocSlotFromExistingShm(ExtendedBlockId blockId) {
      if (notFull.isEmpty()) {
        return null;
      }
      // Always allocate from the first (lowest-keyed) not-full segment.
      Entry<ShmId, DfsClientShm> entry = notFull.firstEntry();
      DfsClientShm shm = entry.getValue();
      ShmId shmId = shm.getShmId();
      Slot slot = shm.allocAndRegisterSlot(blockId);
      if (shm.isFull()) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": pulled the last slot " + slot.getSlotIdx() +
              " out of " + shm);
        }
        // The segment just filled up: move it from notFull to full.
        DfsClientShm removedShm = notFull.remove(shmId);
        Preconditions.checkState(removedShm == shm);
        full.put(shmId, shm);
      } else {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": pulled slot " + slot.getSlotIdx() +
              " out of " + shm);
        }
      }
      return slot;
    }

    /**
     * Ask the DataNode for a new shared memory segment.
     *
     * {@link #allocSlot} calls this after releasing the manager lock, so the
     * lock is not held while we communicate with the DataNode.
     *
     * @param clientName    The current client name.
     * @param peer          The peer to use to talk to the DataNode.
     *
     * @return              Null if the DataNode does not support shared memory
     *                        segments, or experienced an error creating the
     *                        shm.  The shared memory segment itself on success.
     * @throws IOException  If there was an error communicating over the socket.
     *                        We will not throw an IOException unless the socket
     *                        itself (or the network) is the problem.
     */
    private DfsClientShm requestNewShm(String clientName, DomainPeer peer)
        throws IOException {
      final DataOutputStream out =
          new DataOutputStream(
              new BufferedOutputStream(peer.getOutputStream()));
      new Sender(out).requestShortCircuitShm(clientName);
      ShortCircuitShmResponseProto resp =
          ShortCircuitShmResponseProto.parseFrom(
              PBHelper.vintPrefixed(peer.getInputStream()));
      String error = resp.hasError() ? resp.getError() : "(unknown)";
      switch (resp.getStatus()) {
      case SUCCESS:
        DomainSocket sock = peer.getDomainSocket();
        byte[] buf = new byte[1];
        FileInputStream[] fis = new FileInputStream[1];
        // The file descriptor for the segment is passed over the domain
        // socket alongside a one-byte payload.
        if (sock.recvFileInputStreams(fis, buf, 0, buf.length) < 0) {
          throw new EOFException("got EOF while trying to transfer the " +
              "file descriptor for the shared memory segment.");
        }
        if (fis[0] == null) {
          throw new IOException("the datanode " + datanode + " failed to " +
              "pass a file descriptor for the shared memory segment.");
        }
        try {
          DfsClientShm shm =
              new DfsClientShm(PBHelper.convert(resp.getId()),
                  fis[0], this, peer);
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": createNewShm: created " + shm);
          }
          return shm;
        } finally {
          // The stream is closed whether or not constructing the
          // DfsClientShm succeeded.
          IOUtils.cleanup(LOG, fis[0]);
        }
      case ERROR_UNSUPPORTED:
        // The DataNode just does not support short-circuit shared memory
        // access, and we should stop asking.
        LOG.info(this + ": datanode does not support short-circuit " +
            "shared memory access: " + error);
        // allocSlot re-takes the manager lock before it reads this flag.
        disabled = true;
        return null;
      default:
        // The datanode experienced some kind of unexpected error when trying to
        // create the short-circuit shared memory segment.
        LOG.warn(this + ": error requesting short-circuit shared memory " +
            "access: " + error);
        return null;
      }
    }

    /**
     * Allocate a new shared memory slot connected to this datanode.
     *
     * Must be called with the manager lock held.  The lock is released while
     * a new segment is requested from the DataNode and is held again when
     * this method returns or throws.
     *
     * @param peer          The peer to use to talk to the DataNode.
     * @param usedPeer      (out param) Will be set to true if we used the peer.
     *                        When a peer is used, it becomes our
     *                        responsibility and should not be closed by the
     *                        caller.
     * @param clientName    The client name.
     * @param blockId       The blockId to put inside the Slot object.
     *
     * @return              null if the manager has been closed or shared
     *                        memory access to this DataNode is disabled; the
     *                        allocated slot otherwise.
     * @throws IOException  If there was an error communicating over the socket.
     */
    Slot allocSlot(DomainPeer peer, MutableBoolean usedPeer,
        String clientName, ExtendedBlockId blockId) throws IOException {
      while (true) {
        if (closed) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": the DfsClientShmManager has been closed.");
          }
          return null;
        }
        if (disabled) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shared memory segment access is disabled.");
          }
          return null;
        }
        // Try to use an existing slot.
        Slot slot = allocSlotFromExistingShm(blockId);
        if (slot != null) {
          return slot;
        }
        // There are no free slots.  If someone is loading more slots, wait
        // for that to finish.
        if (loading) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": waiting for loading to finish...");
          }
          finishedLoading.awaitUninterruptibly();
        } else {
          // Otherwise, load the slot ourselves.
          loading = true;
          lock.unlock();
          DfsClientShm shm;
          try {
            shm = requestNewShm(clientName, peer);
            if (shm == null) {
              // Go back to the top of the loop (after the finally block has
              // re-taken the lock).  If the DataNode reported that shared
              // memory is unsupported, the disabled check returns null;
              // otherwise the request is attempted again.
              continue;
            }
            // See #{DfsClientShmManager#domainSocketWatcher} for details
            // about why we do this before retaking the manager lock.
            domainSocketWatcher.add(peer.getDomainSocket(), shm);
            // The DomainPeer is now our responsibility, and should not be
            // closed by the caller.
            usedPeer.setValue(true);
          } finally {
            lock.lock();
            loading = false;
            finishedLoading.signalAll();
          }
          if (shm.isStale()) {
            // If the peer closed immediately after the shared memory segment
            // was created, the DomainSocketWatcher callback might already have
            // fired and marked the shm as stale.  In this case, we obviously
            // don't want to add the SharedMemorySegment to our list of valid
            // not-full segments.
            if (LOG.isDebugEnabled()) {
              LOG.debug(this + ": the UNIX domain socket associated with " +
                  "this short-circuit memory closed before we could make " +
                  "use of the shm.");
            }
          } else {
            notFull.put(shm.getShmId(), shm);
          }
        }
      }
    }

    /**
     * Stop tracking a slot.
     *
     * Must be called with the manager lock held.
     *
     * @param slot          The slot to release.
     */
    void freeSlot(Slot slot) {
      DfsClientShm shm = (DfsClientShm)slot.getShm();
      shm.unregisterSlot(slot.getSlotIdx());
      if (shm.isStale()) {
        // Stale shared memory segments should not be tracked here.
        Preconditions.checkState(!full.containsKey(shm.getShmId()));
        Preconditions.checkState(!notFull.containsKey(shm.getShmId()));
        if (shm.isEmpty()) {
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": freeing empty stale " + shm);
          }
          shm.free();
        }
      } else {
        ShmId shmId = shm.getShmId();
        full.remove(shmId); // The shm can't be full if we just freed a slot.
        if (shm.isEmpty()) {
          notFull.remove(shmId);

          // If the shared memory segment is now empty, we call shutdown(2) on
          // the UNIX domain socket associated with it.  The DomainSocketWatcher,
          // which is watching this socket, will call DfsClientShm#handle,
          // cleaning up this shared memory segment.
          //
          // See #{DfsClientShmManager#domainSocketWatcher} for details about why
          // we don't want to call DomainSocketWatcher#remove directly here.
          //
          // Note that we could experience 'fragmentation' here, where the
          // DFSClient allocates a bunch of slots in different shared memory
          // segments, and then frees most of them, but never fully empties out
          // any segment.  We make some attempt to avoid this fragmentation by
          // always allocating new slots out of the shared memory segment with the
          // lowest ID, but it could still occur.  In most workloads,
          // fragmentation should not be a major concern, since it doesn't impact
          // peak file descriptor usage or the speed of allocation.
          if (LOG.isTraceEnabled()) {
            LOG.trace(this + ": shutting down UNIX domain socket for " +
                "empty " + shm);
          }
          shutdown(shm);
        } else {
          notFull.put(shmId, shm);
        }
      }
    }

    /**
     * Unregister a shared memory segment.
     *
     * Once a segment is unregistered, we will not allocate any more slots
     * inside that segment.
     *
     * The DomainSocketWatcher calls this while holding the DomainSocketWatcher
     * lock.
     *
     * @param shmId         The ID of the shared memory segment to unregister.
     */
    void unregisterShm(ShmId shmId) {
      lock.lock();
      try {
        full.remove(shmId);
        notFull.remove(shmId);
      } finally {
        lock.unlock();
      }
    }

    @Override
    public String toString() {
      return String.format("EndpointShmManager(%s, parent=%s)",
          datanode, DfsClientShmManager.this);
    }

    /**
     * Build a view of this endpoint's state for a {@link Visitor}.
     *
     * The returned object references the live {@code full} and
     * {@code notFull} maps rather than copies of them.
     *
     * @return              The visitor info for this DataNode.
     */
    PerDatanodeVisitorInfo getVisitorInfo() {
      return new PerDatanodeVisitorInfo(full, notFull, disabled);
    }

    /**
     * Shut down the UNIX domain socket associated with a shared memory
     * segment.  An IOException from the shutdown is logged, not rethrown.
     *
     * @param shm           The shared memory segment whose socket to shut down.
     */
    final void shutdown(DfsClientShm shm) {
      try {
        shm.getPeer().getDomainSocket().shutdown();
      } catch (IOException e) {
        LOG.warn(this + ": error shutting down shm: got IOException calling " +
            "shutdown(SHUT_RDWR)", e);
      }
    }
  }

  /**
   * True once {@link #close()} has been called.
   *
   * Protected by the manager lock.
   */
  private boolean closed = false;

  /**
   * The manager lock, which guards the state of this object and of every
   * {@link EndpointShmManager}.
   */
  private final ReentrantLock lock = new ReentrantLock();

  /**
   * A condition variable which is signalled when we finish loading a segment
   * from the Datanode.
   */
  private final Condition finishedLoading = lock.newCondition();

  /**
   * Information about each Datanode.
   */
  private final HashMap<DatanodeInfo, EndpointShmManager> datanodes =
      new HashMap<DatanodeInfo, EndpointShmManager>(1);

  /**
   * The DomainSocketWatcher which keeps track of the UNIX domain socket
   * associated with each shared memory segment.
   *
   * Note: because the DomainSocketWatcher makes callbacks into this
   * DfsClientShmManager object, you MUST NOT attempt to take the
   * DomainSocketWatcher lock while holding the DfsClientShmManager lock,
   * or else deadlock might result.   This means that most DomainSocketWatcher
   * methods are off-limits unless you release the manager lock first.
   */
  private final DomainSocketWatcher domainSocketWatcher;

  DfsClientShmManager(int interruptCheckPeriodMs) throws IOException {
    this.domainSocketWatcher = new DomainSocketWatcher(interruptCheckPeriodMs);
  }

  /**
   * Allocate a shared memory slot for talking to the given DataNode.
   *
   * Takes the manager lock, creating the per-DataNode
   * {@link EndpointShmManager} on first use, and delegates to
   * {@link EndpointShmManager#allocSlot}.
   *
   * @param datanode      The DataNode to allocate a slot for.
   * @param peer          The peer to use to talk to the DataNode.
   * @param usedPeer      (out param) Will be set to true if we used the peer.
   * @param blockId       The blockId to put inside the Slot object.
   * @param clientName    The client name.
   *
   * @return              null if this manager has been closed or no slot
   *                        could be allocated; the slot otherwise.
   * @throws IOException  If there was an error communicating over the socket.
   */
  public Slot allocSlot(DatanodeInfo datanode, DomainPeer peer,
      MutableBoolean usedPeer, ExtendedBlockId blockId,
      String clientName) throws IOException {
    lock.lock();
    try {
      if (closed) {
        if (LOG.isTraceEnabled()) {
          LOG.trace(this + ": the DfsClientShmManager is closed.");
        }
        return null;
      }
      EndpointShmManager shmManager = datanodes.get(datanode);
      if (shmManager == null) {
        shmManager = new EndpointShmManager(datanode);
        datanodes.put(datanode, shmManager);
      }
      return shmManager.allocSlot(peer, usedPeer, clientName, blockId);
    } finally {
      lock.unlock();
    }
  }

  /**
   * Release a slot previously returned by {@link #allocSlot}.
   *
   * Takes the manager lock and delegates to the {@link EndpointShmManager}
   * that owns the slot's shared memory segment.
   *
   * @param slot          The slot to release.
   */
  public void freeSlot(Slot slot) {
    lock.lock();
    try {
      DfsClientShm shm = (DfsClientShm)slot.getShm();
      shm.getEndpointShmManager().freeSlot(slot);
    } finally {
      lock.unlock();
    }
  }

  /**
   * The per-DataNode state passed to a {@link Visitor}.
   */
  @VisibleForTesting
  public static class PerDatanodeVisitorInfo {
    public final TreeMap<ShmId, DfsClientShm> full;
    public final TreeMap<ShmId, DfsClientShm> notFull;
    public final boolean disabled;

    PerDatanodeVisitorInfo(TreeMap<ShmId, DfsClientShm> full,
        TreeMap<ShmId, DfsClientShm> notFull, boolean disabled) {
      this.full = full;
      this.notFull = notFull;
      this.disabled = disabled;
    }
  }

  /**
   * Callback used by {@link #visit(Visitor)} to inspect the manager's state.
   */
  @VisibleForTesting
  public interface Visitor {
    void visit(HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info)
        throws IOException;
  }

  /**
   * Invoke a visitor on the state of every known DataNode.
   *
   * The visitor runs while the manager lock is held.
   *
   * @param visitor       The visitor to invoke.
   * @throws IOException  If the visitor throws one.
   */
  @VisibleForTesting
  public void visit(Visitor visitor) throws IOException {
    lock.lock();
    try {
      HashMap<DatanodeInfo, PerDatanodeVisitorInfo> info =
          new HashMap<DatanodeInfo, PerDatanodeVisitorInfo>();
      for (Entry<DatanodeInfo, EndpointShmManager> entry :
            datanodes.entrySet()) {
        info.put(entry.getKey(), entry.getValue().getVisitorInfo());
      }
      visitor.visit(info);
    } finally {
      lock.unlock();
    }
  }

  /**
   * Close the DfsClientShmManager.
   *
   * Calling this more than once has no further effect.
   */
  @Override
  public void close() throws IOException {
    lock.lock();
    try {
      if (closed) {
        return;
      }
      closed = true;
    } finally {
      lock.unlock();
    }
    // When closed, the domainSocketWatcher will issue callbacks that mark
    // all the outstanding DfsClientShm segments as stale.
    // This is done after releasing the manager lock; see the note on
    // domainSocketWatcher about lock ordering.
    IOUtils.cleanup(LOG, domainSocketWatcher);
  }


  @Override
  public String toString() {
    return String.format("ShortCircuitShmManager(%08x)",
        System.identityHashCode(this));
  }

  @VisibleForTesting
  public DomainSocketWatcher getDomainSocketWatcher() {
    return domainSocketWatcher;
  }
}