001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hdfs;
020
import com.google.common.collect.Iterators;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.inotify.EventBatch;
import org.apache.hadoop.hdfs.inotify.EventBatchList;
import org.apache.hadoop.hdfs.inotify.MissingEventsException;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Random;
import java.util.concurrent.TimeUnit;
036
037/**
038 * Stream for reading inotify events. DFSInotifyEventInputStreams should not
039 * be shared among multiple threads.
040 */
041@InterfaceAudience.Public
042@InterfaceStability.Unstable
043public class DFSInotifyEventInputStream {
044  public static Logger LOG = LoggerFactory.getLogger(DFSInotifyEventInputStream
045      .class);
046
047  private final ClientProtocol namenode;
048  private Iterator<EventBatch> it;
049  private long lastReadTxid;
050  /**
051   * The most recent txid the NameNode told us it has sync'ed -- helps us
052   * determine how far behind we are in the edit stream.
053   */
054  private long syncTxid;
055  /**
056   * Used to generate wait times in {@link DFSInotifyEventInputStream#take()}.
057   */
058  private Random rng = new Random();
059
060  private static final int INITIAL_WAIT_MS = 10;
061
062  DFSInotifyEventInputStream(ClientProtocol namenode) throws IOException {
063    this(namenode, namenode.getCurrentEditLogTxid()); // only consider new txn's
064  }
065
066  DFSInotifyEventInputStream(ClientProtocol namenode, long lastReadTxid)
067      throws IOException {
068    this.namenode = namenode;
069    this.it = Iterators.emptyIterator();
070    this.lastReadTxid = lastReadTxid;
071  }
072
073  /**
074   * Returns the next batch of events in the stream or null if no new
075   * batches are currently available.
076   *
077   * @throws IOException because of network error or edit log
078   * corruption. Also possible if JournalNodes are unresponsive in the
079   * QJM setting (even one unresponsive JournalNode is enough in rare cases),
080   * so catching this exception and retrying at least a few times is
081   * recommended.
082   * @throws MissingEventsException if we cannot return the next batch in the
083   * stream because the data for the events (and possibly some subsequent
084   * events) has been deleted (generally because this stream is a very large
085   * number of transactions behind the current state of the NameNode). It is
086   * safe to continue reading from the stream after this exception is thrown
087   * The next available batch of events will be returned.
088   */
089  public EventBatch poll() throws IOException, MissingEventsException {
090    // need to keep retrying until the NN sends us the latest committed txid
091    if (lastReadTxid == -1) {
092      LOG.debug("poll(): lastReadTxid is -1, reading current txid from NN");
093      lastReadTxid = namenode.getCurrentEditLogTxid();
094      return null;
095    }
096    if (!it.hasNext()) {
097      EventBatchList el = namenode.getEditsFromTxid(lastReadTxid + 1);
098      if (el.getLastTxid() != -1) {
099        // we only want to set syncTxid when we were actually able to read some
100        // edits on the NN -- otherwise it will seem like edits are being
101        // generated faster than we can read them when the problem is really
102        // that we are temporarily unable to read edits
103        syncTxid = el.getSyncTxid();
104        it = el.getBatches().iterator();
105        long formerLastReadTxid = lastReadTxid;
106        lastReadTxid = el.getLastTxid();
107        if (el.getFirstTxid() != formerLastReadTxid + 1) {
108          throw new MissingEventsException(formerLastReadTxid + 1,
109              el.getFirstTxid());
110        }
111      } else {
112        LOG.debug("poll(): read no edits from the NN when requesting edits " +
113          "after txid {}", lastReadTxid);
114        return null;
115      }
116    }
117
118    if (it.hasNext()) { // can be empty if el.getLastTxid != -1 but none of the
119      // newly seen edit log ops actually got converted to events
120      return it.next();
121    } else {
122      return null;
123    }
124  }
125
126  /**
127   * Return a estimate of how many transaction IDs behind the NameNode's
128   * current state this stream is. Clients should periodically call this method
129   * and check if its result is steadily increasing, which indicates that they
130   * are falling behind (i.e. transaction are being generated faster than the
131   * client is reading them). If a client falls too far behind events may be
132   * deleted before the client can read them.
133   * <p/>
134   * A return value of -1 indicates that an estimate could not be produced, and
135   * should be ignored. The value returned by this method is really only useful
136   * when compared to previous or subsequent returned values.
137   */
138  public long getTxidsBehindEstimate() {
139    if (syncTxid == 0) {
140      return -1;
141    } else {
142      assert syncTxid >= lastReadTxid;
143      // this gives the difference between the last txid we have fetched to the
144      // client and syncTxid at the time we last fetched events from the
145      // NameNode
146      return syncTxid - lastReadTxid;
147    }
148  }
149
150  /**
151   * Returns the next event batch in the stream, waiting up to the specified
152   * amount of time for a new batch. Returns null if one is not available at the
153   * end of the specified amount of time. The time before the method returns may
154   * exceed the specified amount of time by up to the time required for an RPC
155   * to the NameNode.
156   *
157   * @param time number of units of the given TimeUnit to wait
158   * @param tu the desired TimeUnit
159   * @throws IOException see {@link DFSInotifyEventInputStream#poll()}
160   * @throws MissingEventsException
161   * see {@link DFSInotifyEventInputStream#poll()}
162   * @throws InterruptedException if the calling thread is interrupted
163   */
164  public EventBatch poll(long time, TimeUnit tu) throws IOException,
165      InterruptedException, MissingEventsException {
166    long initialTime = Time.monotonicNow();
167    long totalWait = TimeUnit.MILLISECONDS.convert(time, tu);
168    long nextWait = INITIAL_WAIT_MS;
169    EventBatch next = null;
170    while ((next = poll()) == null) {
171      long timeLeft = totalWait - (Time.monotonicNow() - initialTime);
172      if (timeLeft <= 0) {
173        LOG.debug("timed poll(): timed out");
174        break;
175      } else if (timeLeft < nextWait * 2) {
176        nextWait = timeLeft;
177      } else {
178        nextWait *= 2;
179      }
180      LOG.debug("timed poll(): poll() returned null, sleeping for {} ms",
181          nextWait);
182      Thread.sleep(nextWait);
183    }
184
185    return next;
186  }
187
188  /**
189   * Returns the next batch of events in the stream, waiting indefinitely if
190   * a new batch  is not immediately available.
191   *
192   * @throws IOException see {@link DFSInotifyEventInputStream#poll()}
193   * @throws MissingEventsException see
194   * {@link DFSInotifyEventInputStream#poll()}
195   * @throws InterruptedException if the calling thread is interrupted
196   */
197  public EventBatch take() throws IOException, InterruptedException,
198      MissingEventsException {
199    EventBatch next = null;
200    int nextWaitMin = INITIAL_WAIT_MS;
201    while ((next = poll()) == null) {
202      // sleep for a random period between nextWaitMin and nextWaitMin * 2
203      // to avoid stampedes at the NN if there are multiple clients
204      int sleepTime = nextWaitMin + rng.nextInt(nextWaitMin);
205      LOG.debug("take(): poll() returned null, sleeping for {} ms", sleepTime);
206      Thread.sleep(sleepTime);
207      // the maximum sleep is 2 minutes
208      nextWaitMin = Math.min(60000, nextWaitMin * 2);
209    }
210
211    return next;
212  }
213}