/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;

/**
 * This class defines a replica in a pipeline, which includes
 * a persistent replica being written to by a dfs client,
 * a temporary replica being replicated by a source datanode, or
 * a replica being copied for balancing purposes.
 *
 * The base class implements a temporary replica.
 */
public class ReplicaInPipeline extends ReplicaInfo
    implements ReplicaInPipelineInterface {
  private long bytesAcked;
  private long bytesOnDisk;
  private byte[] lastChecksum;
  private Thread writer;

  /**
   * Bytes reserved for this replica on the containing volume.
   * Based on the difference between the estimated maximum block length
   * and the bytes already written to this block.
   */
  private long bytesReserved;

  /**
   * Constructor for a zero-length replica. The writer thread defaults
   * to the calling thread.
   * @param blockId block id
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  public ReplicaInPipeline(long blockId, long genStamp,
      FsVolumeSpi vol, File dir, long bytesToReserve) {
    this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(),
        bytesToReserve);
  }

  /**
   * Constructor.
   * @param block a block
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   */
  ReplicaInPipeline(Block block,
      FsVolumeSpi vol, File dir, Thread writer) {
    this(block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
        vol, dir, writer, 0L);
  }

  /**
   * Constructor.
   * @param blockId block id
   * @param len replica length
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  ReplicaInPipeline(long blockId, long len, long genStamp,
      FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
    super(blockId, len, genStamp, vol, dir);
    this.bytesAcked = len;
    this.bytesOnDisk = len;
    this.writer = writer;
    this.bytesReserved = bytesToReserve;
  }

  /**
   * Copy constructor.
   * @param from the replica to copy from
   */
  public ReplicaInPipeline(ReplicaInPipeline from) {
    super(from);
    this.bytesAcked = from.getBytesAcked();
    this.bytesOnDisk = from.getBytesOnDisk();
    this.writer = from.writer;
    this.bytesReserved = from.bytesReserved;
  }

  @Override
  public long getVisibleLength() {
    return -1; // no bytes are visible for a replica still in the pipeline
  }

  @Override // ReplicaInfo
  public ReplicaState getState() {
    return ReplicaState.TEMPORARY;
  }

  @Override // ReplicaInPipelineInterface
  public long getBytesAcked() {
    return bytesAcked;
  }

  @Override // ReplicaInPipelineInterface
  public void setBytesAcked(long bytesAcked) {
    long newBytesAcked = bytesAcked - this.bytesAcked;
    this.bytesAcked = bytesAcked;

    // Once bytes are ACK'ed we can release equivalent space from the
    // volume's reservedForRbw count. We could have released it as soon
    // as the write-to-disk completed but that would be inefficient.
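    // For illustration (hypothetical numbers): if 64 KB were acked
    // previously and the client has now acked 128 KB in total,
    // newBytesAcked is 64 KB, so 64 KB is returned to the volume's
    // reservation and bytesReserved shrinks by the same amount.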
    getVolume().releaseReservedSpace(newBytesAcked);
    bytesReserved -= newBytesAcked;
  }

  @Override // ReplicaInPipelineInterface
  public long getBytesOnDisk() {
    return bytesOnDisk;
  }

  @Override
  public long getBytesReserved() {
    return bytesReserved;
  }

  @Override // ReplicaInPipelineInterface
  public void releaseAllBytesReserved() {
    getVolume().releaseReservedSpace(bytesReserved);
    bytesReserved = 0;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized void setLastChecksumAndDataLen(long dataLength,
      byte[] lastChecksum) {
    this.bytesOnDisk = dataLength;
    this.lastChecksum = lastChecksum;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized ChunkChecksum getLastChecksumAndDataLen() {
    return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
  }

  /**
   * Set the thread that is writing to this replica.
   * @param writer a thread writing to this replica
   */
  public void setWriter(Thread writer) {
    this.writer = writer;
  }

  @Override // Object
  public boolean equals(Object o) {
    return super.equals(o);
  }

  /**
   * Interrupt the writing thread and wait until it dies.
   * @param xceiverStopTimeout maximum time, in milliseconds, to wait for
   *                           the writer thread to die
   * @throws IOException if the wait is interrupted or times out
   */
  public void stopWriter(long xceiverStopTimeout) throws IOException {
    if (writer != null && writer != Thread.currentThread()
        && writer.isAlive()) {
      writer.interrupt();
      try {
        writer.join(xceiverStopTimeout);
        if (writer.isAlive()) {
          final String msg = "Join on writer thread " + writer + " timed out";
          DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
          throw new IOException(msg);
        }
      } catch (InterruptedException e) {
        throw new IOException("Waiting for writer thread is interrupted.");
      }
    }
  }

  @Override // Object
  public int hashCode() {
    return super.hashCode();
  }

  @Override // ReplicaInPipelineInterface
  public ReplicaOutputStreams createStreams(boolean isCreate,
      DataChecksum requestedChecksum) throws IOException {
    File blockFile = getBlockFile();
    File metaFile = getMetaFile();
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("writeTo blockfile is " + blockFile +
          " of size " + blockFile.length());
      DataNode.LOG.debug("writeTo metafile is " + metaFile +
          " of size " + metaFile.length());
    }
    long blockDiskSize = 0L;
    long crcDiskSize = 0L;

    // the checksum that should actually be used -- this
    // may differ from requestedChecksum for appends.
    final DataChecksum checksum;

    RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");

    if (!isCreate) {
      // For append or recovery, we must enforce the existing checksum.
      // Also, verify that the file has correct lengths, etc.
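      // Worked example (hypothetical sizes): with 512 bytes per checksum
      // chunk and a 4-byte CRC, a replica with bytesOnDisk = 1000 covers
      // ceil(1000/512) = 2 chunks, so the meta file must hold at least
      // the header plus 2 * 4 = 8 bytes of checksums; anything shorter
      // is treated as corruption below.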
      boolean checkedMeta = false;
      try {
        BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
        checksum = header.getChecksum();

        if (checksum.getBytesPerChecksum() !=
            requestedChecksum.getBytesPerChecksum()) {
          throw new IOException("Client requested checksum " +
              requestedChecksum + " when appending to an existing block " +
              "with different chunk size: " + checksum);
        }

        int bytesPerChunk = checksum.getBytesPerChecksum();
        int checksumSize = checksum.getChecksumSize();

        blockDiskSize = bytesOnDisk;
        crcDiskSize = BlockMetadataHeader.getHeaderSize() +
            (blockDiskSize + bytesPerChunk - 1) / bytesPerChunk * checksumSize;
        if (blockDiskSize > 0 &&
            (blockDiskSize > blockFile.length() ||
             crcDiskSize > metaFile.length())) {
          throw new IOException("Corrupted block: " + this);
        }
        checkedMeta = true;
      } finally {
        if (!checkedMeta) {
          // clean up in case of exceptions.
          IOUtils.closeStream(metaRAF);
        }
      }
    } else {
      // for create, we can use the requested checksum
      checksum = requestedChecksum;
    }

    FileOutputStream blockOut = null;
    FileOutputStream crcOut = null;
    try {
      blockOut = new FileOutputStream(
          new RandomAccessFile(blockFile, "rw").getFD());
      crcOut = new FileOutputStream(metaRAF.getFD());
      if (!isCreate) {
        // For append, position both streams at the end of the data
        // already on disk so new bytes are written after it.
        blockOut.getChannel().position(blockDiskSize);
        crcOut.getChannel().position(crcDiskSize);
      }
      return new ReplicaOutputStreams(blockOut, crcOut, checksum,
          getVolume().isTransientStorage());
    } catch (IOException e) {
      IOUtils.closeStream(blockOut);
      IOUtils.closeStream(metaRAF);
      throw e;
    }
  }

  @Override
  public String toString() {
    return super.toString()
        + "\n  bytesAcked=" + bytesAcked
        + "\n  bytesOnDisk=" + bytesOnDisk;
  }
}
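/*
 * Usage sketch (illustrative only; argument values are hypothetical and
 * this is not a verbatim DataNode code path):
 *
 *   ReplicaInPipeline rip = new ReplicaInPipeline(
 *       blockId, genStamp, volume, dir, expectedBlockLen);
 *   ReplicaOutputStreams streams = rip.createStreams(true,
 *       DataChecksum.newDataChecksum(DataChecksum.Type.CRC32C, 512));
 *   // ... write packets; after each downstream ack:
 *   rip.setBytesAcked(totalBytesAcked);
 *   // ... on pipeline teardown or recovery:
 *   rip.stopWriter(xceiverStopTimeoutMillis);
 */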