001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.hdfs;
020    
import com.google.common.collect.Iterators;
import com.google.common.util.concurrent.UncheckedExecutionException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.inotify.Event;
import org.apache.hadoop.hdfs.inotify.EventsList;
import org.apache.hadoop.hdfs.inotify.MissingEventsException;
import org.apache.hadoop.hdfs.protocol.ClientProtocol;
import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
043    
044    /**
045     * Stream for reading inotify events. DFSInotifyEventInputStreams should not
046     * be shared among multiple threads.
047     */
048    @InterfaceAudience.Public
049    @InterfaceStability.Unstable
050    public class DFSInotifyEventInputStream {
051      public static Logger LOG = LoggerFactory.getLogger(DFSInotifyEventInputStream
052          .class);
053    
054      private final ClientProtocol namenode;
055      private Iterator<Event> it;
056      private long lastReadTxid;
057      /**
058       * The most recent txid the NameNode told us it has sync'ed -- helps us
059       * determine how far behind we are in the edit stream.
060       */
061      private long syncTxid;
062      /**
063       * Used to generate wait times in {@link DFSInotifyEventInputStream#take()}.
064       */
065      private Random rng = new Random();
066    
067      private static final int INITIAL_WAIT_MS = 10;
068    
069      DFSInotifyEventInputStream(ClientProtocol namenode) throws IOException {
070        this(namenode, namenode.getCurrentEditLogTxid()); // only consider new txn's
071      }
072    
073      DFSInotifyEventInputStream(ClientProtocol namenode, long lastReadTxid)
074          throws IOException {
075        this.namenode = namenode;
076        this.it = Iterators.emptyIterator();
077        this.lastReadTxid = lastReadTxid;
078      }
079    
080      /**
081       * Returns the next event in the stream or null if no new events are currently
082       * available.
083       *
084       * @throws IOException because of network error or edit log
085       * corruption. Also possible if JournalNodes are unresponsive in the
086       * QJM setting (even one unresponsive JournalNode is enough in rare cases),
087       * so catching this exception and retrying at least a few times is
088       * recommended.
089       * @throws MissingEventsException if we cannot return the next event in the
090       * stream because the data for the event (and possibly some subsequent events)
091       * has been deleted (generally because this stream is a very large number of
092       * events behind the current state of the NameNode). It is safe to continue
093       * reading from the stream after this exception is thrown -- the next
094       * available event will be returned.
095       */
096      public Event poll() throws IOException, MissingEventsException {
097        // need to keep retrying until the NN sends us the latest committed txid
098        if (lastReadTxid == -1) {
099          LOG.debug("poll(): lastReadTxid is -1, reading current txid from NN");
100          lastReadTxid = namenode.getCurrentEditLogTxid();
101          return null;
102        }
103        if (!it.hasNext()) {
104          EventsList el = namenode.getEditsFromTxid(lastReadTxid + 1);
105          if (el.getLastTxid() != -1) {
106            // we only want to set syncTxid when we were actually able to read some
107            // edits on the NN -- otherwise it will seem like edits are being
108            // generated faster than we can read them when the problem is really
109            // that we are temporarily unable to read edits
110            syncTxid = el.getSyncTxid();
111            it = el.getEvents().iterator();
112            long formerLastReadTxid = lastReadTxid;
113            lastReadTxid = el.getLastTxid();
114            if (el.getFirstTxid() != formerLastReadTxid + 1) {
115              throw new MissingEventsException(formerLastReadTxid + 1,
116                  el.getFirstTxid());
117            }
118          } else {
119            LOG.debug("poll(): read no edits from the NN when requesting edits " +
120              "after txid {}", lastReadTxid);
121            return null;
122          }
123        }
124    
125        if (it.hasNext()) { // can be empty if el.getLastTxid != -1 but none of the
126          // newly seen edit log ops actually got converted to events
127          return it.next();
128        } else {
129          return null;
130        }
131      }
132    
133      /**
134       * Return a estimate of how many events behind the NameNode's current state
135       * this stream is. Clients should periodically call this method and check if
136       * its result is steadily increasing, which indicates that they are falling
137       * behind (i.e. events are being generated faster than the client is reading
138       * them). If a client falls too far behind events may be deleted before the
139       * client can read them.
140       * <p/>
141       * A return value of -1 indicates that an estimate could not be produced, and
142       * should be ignored. The value returned by this method is really only useful
143       * when compared to previous or subsequent returned values.
144       */
145      public long getEventsBehindEstimate() {
146        if (syncTxid == 0) {
147          return -1;
148        } else {
149          assert syncTxid >= lastReadTxid;
150          // this gives the difference between the last txid we have fetched to the
151          // client and syncTxid at the time we last fetched events from the
152          // NameNode
153          return syncTxid - lastReadTxid;
154        }
155      }
156    
157      /**
158       * Returns the next event in the stream, waiting up to the specified amount of
159       * time for a new event. Returns null if a new event is not available at the
160       * end of the specified amount of time. The time before the method returns may
161       * exceed the specified amount of time by up to the time required for an RPC
162       * to the NameNode.
163       *
164       * @param time number of units of the given TimeUnit to wait
165       * @param tu the desired TimeUnit
166       * @throws IOException see {@link DFSInotifyEventInputStream#poll()}
167       * @throws MissingEventsException
168       * see {@link DFSInotifyEventInputStream#poll()}
169       * @throws InterruptedException if the calling thread is interrupted
170       */
171      public Event poll(long time, TimeUnit tu) throws IOException,
172          InterruptedException, MissingEventsException {
173        long initialTime = Time.monotonicNow();
174        long totalWait = TimeUnit.MILLISECONDS.convert(time, tu);
175        long nextWait = INITIAL_WAIT_MS;
176        Event next = null;
177        while ((next = poll()) == null) {
178          long timeLeft = totalWait - (Time.monotonicNow() - initialTime);
179          if (timeLeft <= 0) {
180            LOG.debug("timed poll(): timed out");
181            break;
182          } else if (timeLeft < nextWait * 2) {
183            nextWait = timeLeft;
184          } else {
185            nextWait *= 2;
186          }
187          LOG.debug("timed poll(): poll() returned null, sleeping for {} ms",
188              nextWait);
189          Thread.sleep(nextWait);
190        }
191    
192        return next;
193      }
194    
195      /**
196       * Returns the next event in the stream, waiting indefinitely if a new event
197       * is not immediately available.
198       *
199       * @throws IOException see {@link DFSInotifyEventInputStream#poll()}
200       * @throws MissingEventsException see
201       * {@link DFSInotifyEventInputStream#poll()}
202       * @throws InterruptedException if the calling thread is interrupted
203       */
204      public Event take() throws IOException, InterruptedException,
205          MissingEventsException {
206        Event next = null;
207        int nextWaitMin = INITIAL_WAIT_MS;
208        while ((next = poll()) == null) {
209          // sleep for a random period between nextWaitMin and nextWaitMin * 2
210          // to avoid stampedes at the NN if there are multiple clients
211          int sleepTime = nextWaitMin + rng.nextInt(nextWaitMin);
212          LOG.debug("take(): poll() returned null, sleeping for {} ms", sleepTime);
213          Thread.sleep(sleepTime);
214          // the maximum sleep is 2 minutes
215          nextWaitMin = Math.min(60000, nextWaitMin * 2);
216        }
217    
218        return next;
219      }
220    }