001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 package org.apache.hadoop.hdfs.client; 019 020 import java.io.Closeable; 021 022 import org.apache.hadoop.classification.InterfaceAudience; 023 024 import java.io.FileInputStream; 025 import java.io.IOException; 026 import java.lang.ref.WeakReference; 027 import java.util.Iterator; 028 import java.util.TreeMap; 029 import java.util.Map.Entry; 030 import java.util.concurrent.ScheduledFuture; 031 import java.util.concurrent.ScheduledThreadPoolExecutor; 032 import java.util.concurrent.TimeUnit; 033 import java.util.concurrent.locks.Condition; 034 import java.util.concurrent.locks.Lock; 035 import java.util.concurrent.locks.ReentrantLock; 036 037 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE; 038 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT; 039 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS; 040 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT; 041 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT; 042 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT; 043 044 import org.apache.commons.logging.Log; 045 import org.apache.commons.logging.LogFactory; 046 import org.apache.hadoop.conf.Configuration; 047 import org.apache.hadoop.hdfs.protocol.DatanodeID; 048 import org.apache.hadoop.hdfs.protocol.ExtendedBlock; 049 import org.apache.hadoop.io.IOUtils; 050 051 import com.google.common.annotations.VisibleForTesting; 052 import com.google.common.collect.ComparisonChain; 053 import com.google.common.util.concurrent.ThreadFactoryBuilder; 054 055 /** 056 * Tracks mmap instances used on an HDFS client. 057 * 058 * mmaps can be used concurrently by multiple threads at once. 059 * mmaps cannot be closed while they are in use. 060 * 061 * The cache is important for performance, because the first time an mmap is 062 * created, the page table entries (PTEs) are not yet set up. 063 * Even when reading data that is entirely resident in memory, reading an 064 * mmap the second time is faster. 065 */ 066 @InterfaceAudience.Private 067 public class ClientMmapManager implements Closeable { 068 public static final Log LOG = LogFactory.getLog(ClientMmapManager.class); 069 070 private boolean closed = false; 071 072 private final int cacheSize; 073 074 private final long timeoutNs; 075 076 private final int runsPerTimeout; 077 078 private final Lock lock = new ReentrantLock(); 079 080 /** 081 * Maps block, datanode_id to the client mmap object. 082 * If the ClientMmap is in the process of being loaded, 083 * {@link Waitable<ClientMmap>#await()} will block. 084 * 085 * Protected by the ClientMmapManager lock. 086 */ 087 private final TreeMap<Key, Waitable<ClientMmap>> mmaps = 088 new TreeMap<Key, Waitable<ClientMmap>>(); 089 090 /** 091 * Maps the last use time to the client mmap object. 092 * We ensure that each last use time is unique by inserting a jitter of a 093 * nanosecond or two if necessary. 094 * 095 * Protected by the ClientMmapManager lock. 096 * ClientMmap objects that are in use are never evictable. 097 */ 098 private final TreeMap<Long, ClientMmap> evictable = 099 new TreeMap<Long, ClientMmap>(); 100 101 private final ScheduledThreadPoolExecutor executor = 102 new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder(). 103 setDaemon(true).setNameFormat("ClientMmapManager"). 104 build()); 105 106 /** 107 * The CacheCleaner for this ClientMmapManager. We don't create this 108 * and schedule it until it becomes necessary. 109 */ 110 private CacheCleaner cacheCleaner; 111 112 /** 113 * Factory method to create a ClientMmapManager from a Hadoop 114 * configuration. 115 */ 116 public static ClientMmapManager fromConf(Configuration conf) { 117 return new ClientMmapManager(conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE, 118 DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT), 119 conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS, 120 DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT), 121 conf.getInt(DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT, 122 DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT)); 123 } 124 125 public ClientMmapManager(int cacheSize, long timeoutMs, int runsPerTimeout) { 126 this.cacheSize = cacheSize; 127 this.timeoutNs = timeoutMs * 1000000; 128 this.runsPerTimeout = runsPerTimeout; 129 } 130 131 long getTimeoutMs() { 132 return this.timeoutNs / 1000000; 133 } 134 135 int getRunsPerTimeout() { 136 return this.runsPerTimeout; 137 } 138 139 public String verifyConfigurationMatches(Configuration conf) { 140 StringBuilder bld = new StringBuilder(); 141 int cacheSize = conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE, 142 DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT); 143 if (this.cacheSize != cacheSize) { 144 bld.append("You specified a cache size of ").append(cacheSize). 145 append(", but the existing cache size is ").append(this.cacheSize). 146 append(". "); 147 } 148 long timeoutMs = conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS, 149 DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT); 150 if (getTimeoutMs() != timeoutMs) { 151 bld.append("You specified a cache timeout of ").append(timeoutMs). 152 append(" ms, but the existing cache timeout is "). 153 append(getTimeoutMs()).append("ms").append(". "); 154 } 155 int runsPerTimeout = conf.getInt( 156 DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT, 157 DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT); 158 if (getRunsPerTimeout() != runsPerTimeout) { 159 bld.append("You specified ").append(runsPerTimeout). 160 append(" runs per timeout, but the existing runs per timeout is "). 161 append(getTimeoutMs()).append(". "); 162 } 163 return bld.toString(); 164 } 165 166 private static class Waitable<T> { 167 private T val; 168 private final Condition cond; 169 170 public Waitable(Condition cond) { 171 this.val = null; 172 this.cond = cond; 173 } 174 175 public T await() throws InterruptedException { 176 while (this.val == null) { 177 this.cond.await(); 178 } 179 return this.val; 180 } 181 182 public void provide(T val) { 183 this.val = val; 184 this.cond.signalAll(); 185 } 186 } 187 188 private static class Key implements Comparable<Key> { 189 private final ExtendedBlock block; 190 private final DatanodeID datanode; 191 192 Key(ExtendedBlock block, DatanodeID datanode) { 193 this.block = block; 194 this.datanode = datanode; 195 } 196 197 /** 198 * Compare two ClientMmap regions that we're storing. 199 * 200 * When we append to a block, we bump the genstamp. It is important to 201 * compare the genStamp here. That way, we will not return a shorter 202 * mmap than required. 203 */ 204 @Override 205 public int compareTo(Key o) { 206 return ComparisonChain.start(). 207 compare(block.getBlockId(), o.block.getBlockId()). 208 compare(block.getGenerationStamp(), o.block.getGenerationStamp()). 209 compare(block.getBlockPoolId(), o.block.getBlockPoolId()). 210 compare(datanode, o.datanode). 211 result(); 212 } 213 214 @Override 215 public boolean equals(Object rhs) { 216 if (rhs == null) { 217 return false; 218 } 219 try { 220 Key o = (Key)rhs; 221 return (compareTo(o) == 0); 222 } catch (ClassCastException e) { 223 return false; 224 } 225 } 226 227 @Override 228 public int hashCode() { 229 return block.hashCode() ^ datanode.hashCode(); 230 } 231 } 232 233 /** 234 * Thread which handles expiring mmaps from the cache. 235 */ 236 private static class CacheCleaner implements Runnable, Closeable { 237 private WeakReference<ClientMmapManager> managerRef; 238 private ScheduledFuture<?> future; 239 240 CacheCleaner(ClientMmapManager manager) { 241 this.managerRef= new WeakReference<ClientMmapManager>(manager); 242 } 243 244 @Override 245 public void run() { 246 ClientMmapManager manager = managerRef.get(); 247 if (manager == null) return; 248 long curTime = System.nanoTime(); 249 try { 250 manager.lock.lock(); 251 manager.evictStaleEntries(curTime); 252 } finally { 253 manager.lock.unlock(); 254 } 255 } 256 257 void setFuture(ScheduledFuture<?> future) { 258 this.future = future; 259 } 260 261 @Override 262 public void close() throws IOException { 263 future.cancel(false); 264 } 265 } 266 267 /** 268 * Evict entries which are older than curTime + timeoutNs from the cache. 269 * 270 * NOTE: you must call this function with the lock held. 271 */ 272 private void evictStaleEntries(long curTime) { 273 if (closed) { 274 return; 275 } 276 Iterator<Entry<Long, ClientMmap>> iter = 277 evictable.entrySet().iterator(); 278 while (iter.hasNext()) { 279 Entry<Long, ClientMmap> entry = iter.next(); 280 if (entry.getKey() + timeoutNs >= curTime) { 281 return; 282 } 283 ClientMmap mmap = entry.getValue(); 284 Key key = new Key(mmap.getBlock(), mmap.getDatanodeID()); 285 mmaps.remove(key); 286 iter.remove(); 287 mmap.unmap(); 288 } 289 } 290 291 /** 292 * Evict one mmap object from the cache. 293 * 294 * NOTE: you must call this function with the lock held. 295 * 296 * @return True if an object was evicted; false if none 297 * could be evicted. 298 */ 299 private boolean evictOne() { 300 Entry<Long, ClientMmap> entry = evictable.pollFirstEntry(); 301 if (entry == null) { 302 // We don't want to try creating another mmap region, because the 303 // cache is full. 304 return false; 305 } 306 ClientMmap evictedMmap = entry.getValue(); 307 Key evictedKey = new Key(evictedMmap.getBlock(), 308 evictedMmap.getDatanodeID()); 309 mmaps.remove(evictedKey); 310 evictedMmap.unmap(); 311 return true; 312 } 313 314 /** 315 * Create a new mmap object. 316 * 317 * NOTE: you must call this function with the lock held. 318 * 319 * @param key The key which describes this mmap. 320 * @param in The input stream to use to create the mmap. 321 * @return The new mmap object, or null if there were 322 * insufficient resources. 323 * @throws IOException If there was an I/O error creating the mmap. 324 */ 325 private ClientMmap create(Key key, FileInputStream in) throws IOException { 326 if (mmaps.size() + 1 > cacheSize) { 327 if (!evictOne()) { 328 LOG.warn("mmap cache is full (with " + cacheSize + " elements) and " + 329 "nothing is evictable. Ignoring request for mmap with " + 330 "datanodeID=" + key.datanode + ", " + "block=" + key.block); 331 return null; 332 } 333 } 334 // Create the condition variable that other threads may wait on. 335 Waitable<ClientMmap> waitable = 336 new Waitable<ClientMmap>(lock.newCondition()); 337 mmaps.put(key, waitable); 338 // Load the entry 339 boolean success = false; 340 ClientMmap mmap = null; 341 try { 342 try { 343 lock.unlock(); 344 mmap = ClientMmap.load(this, in, key.block, key.datanode); 345 } finally { 346 lock.lock(); 347 } 348 if (cacheCleaner == null) { 349 cacheCleaner = new CacheCleaner(this); 350 ScheduledFuture<?> future = 351 executor.scheduleAtFixedRate(cacheCleaner, 352 timeoutNs, timeoutNs / runsPerTimeout, TimeUnit.NANOSECONDS); 353 cacheCleaner.setFuture(future); 354 } 355 success = true; 356 } finally { 357 if (!success) { 358 LOG.warn("failed to create mmap for datanodeID=" + key.datanode + 359 ", " + "block=" + key.block); 360 mmaps.remove(key); 361 } 362 waitable.provide(mmap); 363 } 364 if (LOG.isDebugEnabled()) { 365 LOG.info("created a new ClientMmap for block " + key.block + 366 " on datanode " + key.datanode); 367 } 368 return mmap; 369 } 370 371 /** 372 * Get or create an mmap region. 373 * 374 * @param node The DataNode that owns the block for this mmap region. 375 * @param block The block ID, block pool ID, and generation stamp of 376 * the block we want to read. 377 * @param in An open file for this block. This stream is only used 378 * if we have to create a new mmap; if we use an 379 * existing one, it is ignored. 380 * 381 * @return The client mmap region. 382 */ 383 public ClientMmap fetch(DatanodeID datanodeID, ExtendedBlock block, 384 FileInputStream in) throws IOException, InterruptedException { 385 LOG.debug("fetching mmap with datanodeID=" + datanodeID + ", " + 386 "block=" + block); 387 Key key = new Key(block, datanodeID); 388 ClientMmap mmap = null; 389 try { 390 lock.lock(); 391 if (closed) { 392 throw new IOException("ClientMmapManager is closed."); 393 } 394 while (mmap == null) { 395 Waitable<ClientMmap> entry = mmaps.get(key); 396 if (entry == null) { 397 return create(key, in); 398 } 399 mmap = entry.await(); 400 } 401 if (mmap.ref() == 1) { 402 // When going from nobody using the mmap (ref = 0) to somebody 403 // using the mmap (ref = 1), we must make the mmap un-evictable. 404 evictable.remove(mmap.getLastEvictableTimeNs()); 405 } 406 } 407 finally { 408 lock.unlock(); 409 } 410 if (LOG.isDebugEnabled()) { 411 LOG.debug("reusing existing mmap with datanodeID=" + datanodeID + 412 ", " + "block=" + block); 413 } 414 return mmap; 415 } 416 417 /** 418 * Make an mmap evictable. 419 * 420 * When an mmap is evictable, it may be removed from the cache if necessary. 421 * mmaps can only be evictable if nobody is using them. 422 * 423 * @param mmap The mmap to make evictable. 424 */ 425 void makeEvictable(ClientMmap mmap) { 426 try { 427 lock.lock(); 428 if (closed) { 429 // If this ClientMmapManager is closed, then don't bother with the 430 // cache; just close the mmap. 431 mmap.unmap(); 432 return; 433 } 434 long now = System.nanoTime(); 435 while (evictable.containsKey(now)) { 436 now++; 437 } 438 mmap.setLastEvictableTimeNs(now); 439 evictable.put(now, mmap); 440 } finally { 441 lock.unlock(); 442 } 443 } 444 445 @Override 446 public void close() throws IOException { 447 try { 448 lock.lock(); 449 closed = true; 450 IOUtils.cleanup(LOG, cacheCleaner); 451 452 // Unmap all the mmaps that nobody is using. 453 // The ones which are in use will be unmapped just as soon as people stop 454 // using them. 455 evictStaleEntries(Long.MAX_VALUE); 456 457 executor.shutdown(); 458 } finally { 459 lock.unlock(); 460 } 461 } 462 463 @VisibleForTesting 464 public interface ClientMmapVisitor { 465 void accept(ClientMmap mmap); 466 } 467 468 @VisibleForTesting 469 public synchronized void visitMmaps(ClientMmapVisitor visitor) 470 throws InterruptedException { 471 for (Waitable<ClientMmap> entry : mmaps.values()) { 472 visitor.accept(entry.await()); 473 } 474 } 475 476 public void visitEvictable(ClientMmapVisitor visitor) 477 throws InterruptedException { 478 for (ClientMmap mmap : evictable.values()) { 479 visitor.accept(mmap); 480 } 481 } 482 }