/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.datanode;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;

import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.StringUtils;

/**
 * This class represents a replica in a pipeline: either a persistent
 * replica being written to by a DFS client, or a temporary replica being
 * replicated by a source datanode or copied for balancing purposes.
 *
 * This base class implements a temporary replica.
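 *
 * A minimal usage sketch (illustrative only; blockId, genStamp, volume,
 * blockDir, estimatedBlockLen, bytesWritten and lastChunkChecksum are
 * assumed to be supplied by the surrounding dataset code):
 * <pre>{@code
 * ReplicaInPipeline replica = new ReplicaInPipeline(
 *     blockId, genStamp, volume, blockDir, estimatedBlockLen);
 * ReplicaOutputStreams streams = replica.createStreams(true,
 *     DataChecksum.newDataChecksum(DataChecksum.Type.CRC32C, 512));
 * // ... write packets to the streams, then record progress:
 * replica.setLastChecksumAndDataLen(bytesWritten, lastChunkChecksum);
 * replica.setBytesAcked(bytesWritten);
 * }</pre>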
 */
public class ReplicaInPipeline extends ReplicaInfo
                        implements ReplicaInPipelineInterface {
  private long bytesAcked;      // bytes acked by the downstream pipeline
  private long bytesOnDisk;     // bytes written to the block file so far
  private byte[] lastChecksum;  // checksum of the last chunk written to disk
  private Thread writer;        // thread currently writing to this replica

  /**
   * Bytes reserved for this replica on the containing volume.
   * Based on the difference between the estimated maximum block length
   * and the bytes already written to this block.
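   * For example (figures assumed for illustration): with a 128 MB maximum
   * block length and 32 MB already written, roughly 96 MB would remain
   * reserved until the corresponding bytes are acknowledged.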
   */
  private long bytesReserved;

  /**
   * Constructor for a zero-length replica.
   * @param blockId block id
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  public ReplicaInPipeline(long blockId, long genStamp,
        FsVolumeSpi vol, File dir, long bytesToReserve) {
    this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(),
        bytesToReserve);
  }

  /**
   * Constructor.
   * @param block a block
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   */
  ReplicaInPipeline(Block block,
      FsVolumeSpi vol, File dir, Thread writer) {
    this(block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
        vol, dir, writer, 0L);
  }

  /**
   * Constructor.
   * @param blockId block id
   * @param len replica length
   * @param genStamp replica generation stamp
   * @param vol volume where replica is located
   * @param dir directory path where block and meta files are located
   * @param writer a thread that is writing to this replica
   * @param bytesToReserve disk space to reserve for this replica, based on
   *                       the estimated maximum block length.
   */
  ReplicaInPipeline(long blockId, long len, long genStamp,
      FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
    super(blockId, len, genStamp, vol, dir);
    this.bytesAcked = len;
    this.bytesOnDisk = len;
    this.writer = writer;
    this.bytesReserved = bytesToReserve;
  }

  /**
   * Copy constructor.
   * @param from where to copy from
   */
  public ReplicaInPipeline(ReplicaInPipeline from) {
    super(from);
    this.bytesAcked = from.getBytesAcked();
    this.bytesOnDisk = from.getBytesOnDisk();
    this.writer = from.writer;
    this.bytesReserved = from.bytesReserved;
  }

  @Override
  public long getVisibleLength() {
    return -1;  // no bytes are visible to readers while the replica is being written
  }

  @Override  // ReplicaInfo
  public ReplicaState getState() {
    return ReplicaState.TEMPORARY;
  }

  @Override // ReplicaInPipelineInterface
  public long getBytesAcked() {
    return bytesAcked;
  }

  @Override // ReplicaInPipelineInterface
  public void setBytesAcked(long bytesAcked) {
    long newBytesAcked = bytesAcked - this.bytesAcked;
    this.bytesAcked = bytesAcked;

    // Once bytes are ACK'ed we can release equivalent space from the
    // volume's reservedForRbw count. We could have released it as soon
    // as the write-to-disk completed but that would be inefficient.
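    // Illustrative example (assumed figures): if 64 KB were previously
    // acked and the client now acks 128 KB, newBytesAcked is 64 KB; that
    // amount is released below and bytesReserved shrinks by the same amount.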
    getVolume().releaseReservedSpace(newBytesAcked);
    bytesReserved -= newBytesAcked;
  }

  @Override // ReplicaInPipelineInterface
  public long getBytesOnDisk() {
    return bytesOnDisk;
  }

  @Override
  public long getBytesReserved() {
    return bytesReserved;
  }

  @Override // ReplicaInPipelineInterface
  public void releaseAllBytesReserved() {
    getVolume().releaseReservedSpace(bytesReserved);
    bytesReserved = 0;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized void setLastChecksumAndDataLen(long dataLength,
      byte[] lastChecksum) {
    this.bytesOnDisk = dataLength;
    this.lastChecksum = lastChecksum;
  }

  @Override // ReplicaInPipelineInterface
  public synchronized ChunkChecksum getLastChecksumAndDataLen() {
    return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
  }

  /**
   * Set the thread that is writing to this replica.
   * @param writer a thread writing to this replica
   */
  public void setWriter(Thread writer) {
    this.writer = writer;
  }

  /**
   * Interrupt the writing thread if it is alive and not the current thread.
   */
  public void interruptThread() {
    if (writer != null && writer != Thread.currentThread()
        && writer.isAlive()) {
      writer.interrupt();
    }
  }

  @Override  // Object
  public boolean equals(Object o) {
    return super.equals(o);
  }

  /**
   * Interrupt the writing thread and wait until it dies.
   * @param xceiverStopTimeout maximum time, in milliseconds, to wait for
   *                           the writer thread to exit
   * @throws IOException if the writer does not exit within the timeout
   *                     or the wait is interrupted
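   *
   * A typical call (sketch; the timeout accessor shown is assumed to be
   * the datanode's configured xceiver stop timeout):
   * <pre>{@code
   * replica.stopWriter(datanode.getDnConf().getXceiverStopTimeout());
   * }</pre>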
   */
  public void stopWriter(long xceiverStopTimeout) throws IOException {
    if (writer != null && writer != Thread.currentThread() && writer.isAlive()) {
      writer.interrupt();
      try {
        writer.join(xceiverStopTimeout);
        if (writer.isAlive()) {
          final String msg = "Join on writer thread " + writer + " timed out";
          DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
          throw new IOException(msg);
        }
      } catch (InterruptedException e) {
        throw new IOException("Waiting for writer thread is interrupted.", e);
      }
    }
  }

  @Override  // Object
  public int hashCode() {
    return super.hashCode();
  }

  @Override // ReplicaInPipelineInterface
  public ReplicaOutputStreams createStreams(boolean isCreate,
      DataChecksum requestedChecksum) throws IOException {
    File blockFile = getBlockFile();
    File metaFile = getMetaFile();
    if (DataNode.LOG.isDebugEnabled()) {
      DataNode.LOG.debug("writeTo blockfile is " + blockFile +
                         " of size " + blockFile.length());
      DataNode.LOG.debug("writeTo metafile is " + metaFile +
                         " of size " + metaFile.length());
    }
    long blockDiskSize = 0L;
    long crcDiskSize = 0L;

    // The checksum that should actually be used -- this
    // may differ from requestedChecksum for appends.
    final DataChecksum checksum;

    RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");

    if (!isCreate) {
      // For append or recovery, we must enforce the existing checksum.
      // Also, verify that the file has correct lengths, etc.
      boolean checkedMeta = false;
      try {
        BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
        checksum = header.getChecksum();

        if (checksum.getBytesPerChecksum() !=
            requestedChecksum.getBytesPerChecksum()) {
          throw new IOException("Client requested checksum " +
              requestedChecksum + " when appending to an existing block " +
              "with different chunk size: " + checksum);
        }

        int bytesPerChunk = checksum.getBytesPerChecksum();
        int checksumSize = checksum.getChecksumSize();

        blockDiskSize = bytesOnDisk;
        crcDiskSize = BlockMetadataHeader.getHeaderSize() +
            (blockDiskSize + bytesPerChunk - 1) / bytesPerChunk * checksumSize;
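        // Worked example (assumed figures): with bytesPerChunk = 512 and
        // checksumSize = 4, blockDiskSize = 1000 spans ceil(1000/512) = 2
        // chunks, so crcDiskSize = headerSize + 2 * 4 = headerSize + 8.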
        if (blockDiskSize > 0 &&
            (blockDiskSize > blockFile.length() ||
             crcDiskSize > metaFile.length())) {
          throw new IOException("Corrupted block: " + this);
        }
        checkedMeta = true;
      } finally {
        if (!checkedMeta) {
          // clean up in case of exceptions.
          IOUtils.closeStream(metaRAF);
        }
      }
    } else {
      // For create, we can use the requested checksum.
      checksum = requestedChecksum;
    }

    FileOutputStream blockOut = null;
    FileOutputStream crcOut = null;
    try {
      blockOut = new FileOutputStream(
          new RandomAccessFile(blockFile, "rw").getFD());
      crcOut = new FileOutputStream(metaRAF.getFD());
      if (!isCreate) {
        blockOut.getChannel().position(blockDiskSize);
        crcOut.getChannel().position(crcDiskSize);
      }
      return new ReplicaOutputStreams(blockOut, crcOut, checksum,
          getVolume().isTransientStorage());
    } catch (IOException e) {
      IOUtils.closeStream(blockOut);
      IOUtils.closeStream(metaRAF);
      throw e;
    }
  }

  @Override
  public String toString() {
    return super.toString()
        + "\n  bytesAcked=" + bytesAcked
        + "\n  bytesOnDisk=" + bytesOnDisk;
  }
}