/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;

/**
 * This class defines a replica in a pipeline, which includes a persistent
 * replica being written to by a dfs client, or a temporary replica being
 * replicated by a source datanode or being copied for balancing purposes.
 *
 * The base class implements a temporary replica.
 */
public class ReplicaInPipeline extends ReplicaInfo
    implements ReplicaInPipelineInterface {
  private long bytesAcked;
  private long bytesOnDisk;
  private byte[] lastChecksum;
  private Thread writer;

  /**
   * Bytes reserved for this replica on the containing volume.
   * Based on the difference between the estimated maximum block length and
   * the bytes already written to this block.
   */
  private long bytesReserved;

  /**
   * Constructor for a zero length replica.
   * @param blockId block id
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  public ReplicaInPipeline(long blockId, long genStamp,
      FsVolumeSpi vol, File dir, long bytesToReserve) {
    this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(),
        bytesToReserve);
  }
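
  // Illustrative sketch, not part of the original class: how a write pipeline
  // might create a fresh, zero-length replica while reserving room for a full
  // block. The volume, directory, and size values below are assumptions
  // standing in for what the dataset implementation would supply.
  //
  //   FsVolumeSpi vol = ...;   // volume chosen for the new block
  //   File rbwDir = ...;       // rbw directory on that volume
  //   ReplicaInPipeline replica = new ReplicaInPipeline(
  //       blockId, genStamp, vol, rbwDir, expectedBlockSize);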

  /**
   * Constructor.
   * @param block a block
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   */
  ReplicaInPipeline(Block block,
      FsVolumeSpi vol, File dir, Thread writer) {
    this(block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
        vol, dir, writer, 0L);
  }

  /**
   * Constructor.
   * @param blockId block id
   * @param len replica length
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  ReplicaInPipeline(long blockId, long len, long genStamp,
      FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
    super(blockId, len, genStamp, vol, dir);
    this.bytesAcked = len;
    this.bytesOnDisk = len;
    this.writer = writer;
    this.bytesReserved = bytesToReserve;
  }

  /**
   * Copy constructor.
   * @param from replica to copy from
   */
  public ReplicaInPipeline(ReplicaInPipeline from) {
    super(from);
    this.bytesAcked = from.getBytesAcked();
    this.bytesOnDisk = from.getBytesOnDisk();
    this.writer = from.writer;
    this.bytesReserved = from.bytesReserved;
  }

  @Override
  public long getVisibleLength() {
    return -1; // temporary replicas have no visible length
  }

  @Override //ReplicaInfo
  public ReplicaState getState() {
    return ReplicaState.TEMPORARY;
  }

  @Override // ReplicaInPipelineInterface
  public long getBytesAcked() {
    return bytesAcked;
  }

  @Override // ReplicaInPipelineInterface
  public void setBytesAcked(long bytesAcked) {
    long newBytesAcked = bytesAcked - this.bytesAcked;
    this.bytesAcked = bytesAcked;

    // Once bytes are ACK'ed we can release equivalent space from the
    // volume's reservedForRbw count. We could have released it as soon
    // as the write-to-disk completed but that would be inefficient.
    getVolume().releaseReservedSpace(newBytesAcked);
    bytesReserved -= newBytesAcked;
  }
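
  // Worked example of the bookkeeping above (illustrative numbers): if
  // 128 MB were reserved at creation and the downstream pipeline ACKs the
  // first 1 MB, setBytesAcked(1 MB) computes newBytesAcked = 1 MB, hands
  // that much reservation back to the volume, and leaves 127 MB reserved
  // for the bytes still expected.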

  @Override // ReplicaInPipelineInterface
  public long getBytesOnDisk() {
    return bytesOnDisk;
  }

  @Override
  public long getBytesReserved() {
    return bytesReserved;
  }

  @Override // ReplicaInPipelineInterface
  public void releaseAllBytesReserved() {
    getVolume().releaseReservedSpace(bytesReserved);
    bytesReserved = 0;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized void setLastChecksumAndDataLen(long dataLength,
      byte[] lastChecksum) {
    this.bytesOnDisk = dataLength;
    this.lastChecksum = lastChecksum;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized ChunkChecksum getLastChecksumAndDataLen() {
    return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
  }

  /**
   * Set the thread that is writing to this replica.
   * @param writer a thread writing to this replica
   */
  public void setWriter(Thread writer) {
    this.writer = writer;
  }

  /**
   * Interrupt the writing thread if it is alive and is not the
   * current thread.
   */
  public void interruptThread() {
    if (writer != null && writer != Thread.currentThread()
        && writer.isAlive()) {
      this.writer.interrupt();
    }
  }

  @Override // Object
  public boolean equals(Object o) {
    return super.equals(o);
  }

  /**
   * Interrupt the writing thread and wait until it dies.
   * @param xceiverStopTimeout maximum time, in milliseconds, to wait for
   *                           the writer thread to die
   * @throws IOException if the wait is interrupted or times out
   */
  public void stopWriter(long xceiverStopTimeout) throws IOException {
    if (writer != null && writer != Thread.currentThread()
        && writer.isAlive()) {
      writer.interrupt();
      try {
        writer.join(xceiverStopTimeout);
        if (writer.isAlive()) {
          final String msg = "Join on writer thread " + writer + " timed out";
          DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
          throw new IOException(msg);
        }
      } catch (InterruptedException e) {
        throw new IOException("Waiting for writer thread is interrupted.");
      }
    }
  }

  @Override // Object
  public int hashCode() {
    return super.hashCode();
  }
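
  // Worked example of the on-disk size check in createStreams() below
  // (illustrative numbers): with bytesOnDisk = 1024, a 512-byte chunk size,
  // and 4-byte CRC32 checksums, the meta file should hold the header plus
  // ceil(1024 / 512) * 4 = 8 bytes of checksums, i.e.
  // crcDiskSize = BlockMetadataHeader.getHeaderSize() + 8. Anything shorter
  // on disk is treated as a corrupted block.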

  @Override // ReplicaInPipelineInterface
  public ReplicaOutputStreams createStreams(boolean isCreate,
      DataChecksum requestedChecksum) throws IOException {
    File blockFile = getBlockFile();
    File metaFile = getMetaFile();
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("writeTo blockfile is " + blockFile +
          " of size " + blockFile.length());
      DataNode.LOG.debug("writeTo metafile is " + metaFile +
          " of size " + metaFile.length());
    }
    long blockDiskSize = 0L;
    long crcDiskSize = 0L;

    // the checksum that should actually be used -- this
    // may differ from requestedChecksum for appends.
    final DataChecksum checksum;

    RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");

    if (!isCreate) {
      // For append or recovery, we must enforce the existing checksum.
      // Also, verify that the file has correct lengths, etc.
      boolean checkedMeta = false;
      try {
        BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
        checksum = header.getChecksum();

        if (checksum.getBytesPerChecksum() !=
            requestedChecksum.getBytesPerChecksum()) {
          throw new IOException("Client requested checksum " +
              requestedChecksum + " when appending to an existing block " +
              "with different chunk size: " + checksum);
        }

        int bytesPerChunk = checksum.getBytesPerChecksum();
        int checksumSize = checksum.getChecksumSize();

        blockDiskSize = bytesOnDisk;
        crcDiskSize = BlockMetadataHeader.getHeaderSize() +
            (blockDiskSize + bytesPerChunk - 1) / bytesPerChunk * checksumSize;
        if (blockDiskSize > 0 &&
            (blockDiskSize > blockFile.length() ||
             crcDiskSize > metaFile.length())) {
          throw new IOException("Corrupted block: " + this);
        }
        checkedMeta = true;
      } finally {
        if (!checkedMeta) {
          // clean up in case of exceptions.
          IOUtils.closeStream(metaRAF);
        }
      }
    } else {
      // for create, we can use the requested checksum
      checksum = requestedChecksum;
    }

    FileOutputStream blockOut = null;
    FileOutputStream crcOut = null;
    try {
      blockOut = new FileOutputStream(
          new RandomAccessFile(blockFile, "rw").getFD());
      crcOut = new FileOutputStream(metaRAF.getFD());
      if (!isCreate) {
        // position both streams at the end of the existing data so the
        // append continues where the last write left off
        blockOut.getChannel().position(blockDiskSize);
        crcOut.getChannel().position(crcDiskSize);
      }
      return new ReplicaOutputStreams(blockOut, crcOut, checksum,
          getVolume().isTransientStorage());
    } catch (IOException e) {
      IOUtils.closeStream(blockOut);
      IOUtils.closeStream(metaRAF);
      throw e;
    }
  }

  @Override
  public String toString() {
    return super.toString()
        + "\n  bytesAcked=" + bytesAcked
        + "\n  bytesOnDisk=" + bytesOnDisk;
  }
}
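
// Illustrative sketch, not part of the original class: how a recovery or
// append path might take over a replica from its current writer. The timeout
// variable is an assumption standing in for the datanode's configured
// xceiver stop timeout.
//
//   replica.stopWriter(writerStopTimeoutMs);    // interrupt old writer, wait
//   replica.setWriter(Thread.currentThread());  // claim the replica
//   ReplicaOutputStreams streams =
//       replica.createStreams(false, requestedChecksum); // append mode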