/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
018package org.apache.hadoop.hdfs.server.datanode;
019
020import java.io.File;
021import java.io.FileOutputStream;
022import java.io.IOException;
023import java.io.RandomAccessFile;
024
025import org.apache.hadoop.hdfs.protocol.Block;
026import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
027import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
028import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams;
029import org.apache.hadoop.io.IOUtils;
030import org.apache.hadoop.util.DataChecksum;
031import org.apache.hadoop.util.StringUtils;
032
033/** 
034 * This class defines a replica in a pipeline, which
035 * includes a persistent replica being written to by a dfs client or
036 * a temporary replica being replicated by a source datanode or
037 * being copied for the balancing purpose.
038 * 
039 * The base class implements a temporary replica
040 */
041public class ReplicaInPipeline extends ReplicaInfo
042                        implements ReplicaInPipelineInterface {
043  private long bytesAcked;
044  private long bytesOnDisk;
045  private byte[] lastChecksum;  
046  private Thread writer;
047
048  /**
049   * Bytes reserved for this replica on the containing volume.
050   * Based off difference between the estimated maximum block length and
051   * the bytes already written to this block.
052   */
053  private long bytesReserved;
054  
055  /**
056   * Constructor for a zero length replica
057   * @param blockId block id
058   * @param genStamp replica generation stamp
059   * @param vol volume where replica is located
060   * @param dir directory path where block and meta files are located
061   * @param bytesToReserve disk space to reserve for this replica, based on
062   *                       the estimated maximum block length.
063   */
064  public ReplicaInPipeline(long blockId, long genStamp, 
065        FsVolumeSpi vol, File dir, long bytesToReserve) {
066    this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(), bytesToReserve);
067  }
068
069  /**
070   * Constructor
071   * @param block a block
072   * @param vol volume where replica is located
073   * @param dir directory path where block and meta files are located
074   * @param writer a thread that is writing to this replica
075   */
076  ReplicaInPipeline(Block block, 
077      FsVolumeSpi vol, File dir, Thread writer) {
078    this( block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(),
079        vol, dir, writer, 0L);
080  }
081
082  /**
083   * Constructor
084   * @param blockId block id
085   * @param len replica length
086   * @param genStamp replica generation stamp
087   * @param vol volume where replica is located
088   * @param dir directory path where block and meta files are located
089   * @param writer a thread that is writing to this replica
090   * @param bytesToReserve disk space to reserve for this replica, based on
091   *                       the estimated maximum block length.
092   */
093  ReplicaInPipeline(long blockId, long len, long genStamp,
094      FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) {
095    super( blockId, len, genStamp, vol, dir);
096    this.bytesAcked = len;
097    this.bytesOnDisk = len;
098    this.writer = writer;
099    this.bytesReserved = bytesToReserve;
100  }
101
102  /**
103   * Copy constructor.
104   * @param from where to copy from
105   */
106  public ReplicaInPipeline(ReplicaInPipeline from) {
107    super(from);
108    this.bytesAcked = from.getBytesAcked();
109    this.bytesOnDisk = from.getBytesOnDisk();
110    this.writer = from.writer;
111    this.bytesReserved = from.bytesReserved;
112  }
113
114  @Override
115  public long getVisibleLength() {
116    return -1;
117  }
118  
119  @Override  //ReplicaInfo
120  public ReplicaState getState() {
121    return ReplicaState.TEMPORARY;
122  }
123  
124  @Override // ReplicaInPipelineInterface
125  public long getBytesAcked() {
126    return bytesAcked;
127  }
128  
129  @Override // ReplicaInPipelineInterface
130  public void setBytesAcked(long bytesAcked) {
131    long newBytesAcked = bytesAcked - this.bytesAcked;
132    this.bytesAcked = bytesAcked;
133
134    // Once bytes are ACK'ed we can release equivalent space from the
135    // volume's reservedForRbw count. We could have released it as soon
136    // as the write-to-disk completed but that would be inefficient.
137    getVolume().releaseReservedSpace(newBytesAcked);
138    bytesReserved -= newBytesAcked;
139  }
140  
141  @Override // ReplicaInPipelineInterface
142  public long getBytesOnDisk() {
143    return bytesOnDisk;
144  }
145
146  @Override
147  public long getBytesReserved() {
148    return bytesReserved;
149  }
150  
151  @Override
152  public void releaseAllBytesReserved() {  // ReplicaInPipelineInterface
153    getVolume().releaseReservedSpace(bytesReserved);
154    bytesReserved = 0;
155  }
156
157  @Override // ReplicaInPipelineInterface
158  public synchronized void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum) {
159    this.bytesOnDisk = dataLength;
160    this.lastChecksum = lastChecksum;
161  }
162  
163  @Override // ReplicaInPipelineInterface
164  public synchronized ChunkChecksum getLastChecksumAndDataLen() {
165    return new ChunkChecksum(getBytesOnDisk(), lastChecksum);
166  }
167
168  /**
169   * Set the thread that is writing to this replica
170   * @param writer a thread writing to this replica
171   */
172  public void setWriter(Thread writer) {
173    this.writer = writer;
174  }
175  
176  @Override  // Object
177  public boolean equals(Object o) {
178    return super.equals(o);
179  }
180  
181  /**
182   * Interrupt the writing thread and wait until it dies
183   * @throws IOException the waiting is interrupted
184   */
185  public void stopWriter(long xceiverStopTimeout) throws IOException {
186    if (writer != null && writer != Thread.currentThread() && writer.isAlive()) {
187      writer.interrupt();
188      try {
189        writer.join(xceiverStopTimeout);
190        if (writer.isAlive()) {
191          final String msg = "Join on writer thread " + writer + " timed out";
192          DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(writer));
193          throw new IOException(msg);
194        }
195      } catch (InterruptedException e) {
196        throw new IOException("Waiting for writer thread is interrupted.");
197      }
198    }
199  }
200  
201  @Override  // Object
202  public int hashCode() {
203    return super.hashCode();
204  }
205  
206  @Override // ReplicaInPipelineInterface
207  public ReplicaOutputStreams createStreams(boolean isCreate, 
208      DataChecksum requestedChecksum) throws IOException {
209    File blockFile = getBlockFile();
210    File metaFile = getMetaFile();
211    if (DataNode.LOG.isDebugEnabled()) {
212      DataNode.LOG.debug("writeTo blockfile is " + blockFile +
213                         " of size " + blockFile.length());
214      DataNode.LOG.debug("writeTo metafile is " + metaFile +
215                         " of size " + metaFile.length());
216    }
217    long blockDiskSize = 0L;
218    long crcDiskSize = 0L;
219    
220    // the checksum that should actually be used -- this
221    // may differ from requestedChecksum for appends.
222    final DataChecksum checksum;
223    
224    RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw");
225    
226    if (!isCreate) {
227      // For append or recovery, we must enforce the existing checksum.
228      // Also, verify that the file has correct lengths, etc.
229      boolean checkedMeta = false;
230      try {
231        BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF);
232        checksum = header.getChecksum();
233        
234        if (checksum.getBytesPerChecksum() !=
235            requestedChecksum.getBytesPerChecksum()) {
236          throw new IOException("Client requested checksum " +
237              requestedChecksum + " when appending to an existing block " +
238              "with different chunk size: " + checksum);
239        }
240        
241        int bytesPerChunk = checksum.getBytesPerChecksum();
242        int checksumSize = checksum.getChecksumSize();
243        
244        blockDiskSize = bytesOnDisk;
245        crcDiskSize = BlockMetadataHeader.getHeaderSize() +
246          (blockDiskSize+bytesPerChunk-1)/bytesPerChunk*checksumSize;
247        if (blockDiskSize>0 && 
248            (blockDiskSize>blockFile.length() || crcDiskSize>metaFile.length())) {
249          throw new IOException("Corrupted block: " + this);
250        }
251        checkedMeta = true;
252      } finally {
253        if (!checkedMeta) {
254          // clean up in case of exceptions.
255          IOUtils.closeStream(metaRAF);
256        }
257      }
258    } else {
259      // for create, we can use the requested checksum
260      checksum = requestedChecksum;
261    }
262    
263    FileOutputStream blockOut = null;
264    FileOutputStream crcOut = null;
265    try {
266      blockOut = new FileOutputStream(
267          new RandomAccessFile( blockFile, "rw" ).getFD() );
268      crcOut = new FileOutputStream(metaRAF.getFD() );
269      if (!isCreate) {
270        blockOut.getChannel().position(blockDiskSize);
271        crcOut.getChannel().position(crcDiskSize);
272      }
273      return new ReplicaOutputStreams(blockOut, crcOut, checksum,
274          getVolume().isTransientStorage());
275    } catch (IOException e) {
276      IOUtils.closeStream(blockOut);
277      IOUtils.closeStream(metaRAF);
278      throw e;
279    }
280  }
281  
282  @Override
283  public String toString() {
284    return super.toString()
285        + "\n  bytesAcked=" + bytesAcked
286        + "\n  bytesOnDisk=" + bytesOnDisk;
287  }
288}