001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018 package org.apache.hadoop.fs;
019
020 import java.io.FileNotFoundException;
021 import java.io.IOException;
022 import java.io.UnsupportedEncodingException;
023 import java.net.URI;
024 import java.net.URISyntaxException;
025 import java.net.URLDecoder;
026 import java.util.ArrayList;
027 import java.util.List;
028 import java.util.Map;
029 import java.util.TreeMap;
030 import java.util.HashMap;
031 import java.util.concurrent.ConcurrentHashMap;
032
033 import org.apache.commons.logging.Log;
034 import org.apache.commons.logging.LogFactory;
035 import org.apache.hadoop.conf.Configuration;
036 import org.apache.hadoop.fs.permission.FsPermission;
037 import org.apache.hadoop.io.IOUtils;
038 import org.apache.hadoop.io.Text;
039 import org.apache.hadoop.util.LineReader;
040 import org.apache.hadoop.util.Progressable;
041
/**
 * This is an implementation of the Hadoop Archive
 * Filesystem. This archive Filesystem has index files
 * of the form _index* and has contents of the form
 * part-*. The index files store the indexes of the
 * real files and come in two forms, _masterindex
 * and _index. The master index is a level of indirection
 * into the index file that makes lookups faster: the index
 * file is sorted by the hash code of the paths it contains,
 * and the master index holds pointers to the positions in the
 * index for ranges of hash codes.
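 *
 * <p>A minimal usage sketch; the archive location and the underlying
 * cluster authority below are illustrative, not part of this class:
 * <pre>{@code
 * Configuration conf = new Configuration();
 * // "hdfs" is the scheme of the underlying filesystem, "nn:8020" its authority
 * Path file = new Path("har://hdfs-nn:8020/user/alice/logs.har/2012/01/part-00000");
 * FileSystem harFs = file.getFileSystem(conf);
 * FSDataInputStream in = harFs.open(file);
 * }</pre>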
053 */
054
055 public class HarFileSystem extends FilterFileSystem {
056
057 private static final Log LOG = LogFactory.getLog(HarFileSystem.class);
058
059 public static final int VERSION = 3;
060
061 private static final Map<URI, HarMetaData> harMetaCache =
062 new ConcurrentHashMap<URI, HarMetaData>();
063
064 // uri representation of this Har filesystem
065 private URI uri;
066 // the top level path of the archive
067 // in the underlying file system
068 private Path archivePath;
069 // the har auth
070 private String harAuth;
071
072 // pointer into the static metadata cache
073 private HarMetaData metadata;
074
  /**
   * Public no-argument constructor; the underlying filesystem is
   * set up later in {@link #initialize(URI, Configuration)}.
   */
079 public HarFileSystem() {
080 }
081
  /**
   * Constructor to create a HarFileSystem with an
   * underlying filesystem.
   * @param fs the underlying filesystem
   */
087 public HarFileSystem(FileSystem fs) {
088 super(fs);
089 }
090
  /**
   * Initialize a Har filesystem per har archive. The
   * archive home directory is the top level directory
   * in the filesystem that contains the HAR archive.
   * Be careful with this method: you do not want to keep
   * creating new FileSystem instances per call to
   * path.getFileSystem().
   * The har URI is of the form
   * har://underlyingfsscheme-host:port/archivepath,
   * or
   * har:///archivepath, in which case the default filesystem
   * from the configuration is used as the underlying filesystem.
   */
104 public void initialize(URI name, Configuration conf) throws IOException {
105 // decode the name
106 URI underLyingURI = decodeHarURI(name, conf);
107 // we got the right har Path- now check if this is
108 // truly a har filesystem
109 Path harPath = archivePath(
110 new Path(name.getScheme(), name.getAuthority(), name.getPath()));
111 if (harPath == null) {
112 throw new IOException("Invalid path for the Har Filesystem. " +
113 name.toString());
114 }
115 if (fs == null) {
116 fs = FileSystem.get(underLyingURI, conf);
117 }
118 uri = harPath.toUri();
119 archivePath = new Path(uri.getPath());
120 harAuth = getHarAuth(underLyingURI);
121 //check for the underlying fs containing
122 // the index file
123 Path masterIndexPath = new Path(archivePath, "_masterindex");
124 Path archiveIndexPath = new Path(archivePath, "_index");
125 if (!fs.exists(masterIndexPath) || !fs.exists(archiveIndexPath)) {
126 throw new IOException("Invalid path for the Har Filesystem. " +
127 "No index file in " + harPath);
128 }
129
130 metadata = harMetaCache.get(uri);
131 if (metadata != null) {
132 FileStatus mStat = fs.getFileStatus(masterIndexPath);
133 FileStatus aStat = fs.getFileStatus(archiveIndexPath);
134 if (mStat.getModificationTime() != metadata.getMasterIndexTimestamp() ||
135 aStat.getModificationTime() != metadata.getArchiveIndexTimestamp()) {
136 // the archive has been overwritten since we last read it
137 // remove the entry from the meta data cache
138 metadata = null;
139 harMetaCache.remove(uri);
140 }
141 }
142 if (metadata == null) {
143 metadata = new HarMetaData(fs, masterIndexPath, archiveIndexPath);
144 metadata.parseMetaData();
145 harMetaCache.put(uri, metadata);
146 }
147 }
148
  // get the version of the filesystem from the masterindex file
  // the version is currently not very useful since this is the first
  // version of archives
152 public int getHarVersion() throws IOException {
153 if (metadata != null) {
154 return metadata.getVersion();
155 }
156 else {
157 throw new IOException("Invalid meta data for the Har Filesystem");
158 }
159 }
160
  /*
   * find the ancestor of p that is the archive path:
   * the deepest path component that ends with .har.
   * Returns null if no such component exists.
   */
167 private Path archivePath(Path p) {
168 Path retPath = null;
169 Path tmp = p;
170 for (int i=0; i< p.depth(); i++) {
171 if (tmp.toString().endsWith(".har")) {
172 retPath = tmp;
173 break;
174 }
175 tmp = tmp.getParent();
176 }
177 return retPath;
178 }
179
  /**
   * decode the raw URI to get the URI of the underlying filesystem
   * @param rawURI raw Har URI
   * @param conf the configuration used to look up the default filesystem
   * @return filtered URI of the underlying fileSystem
   */
185 private URI decodeHarURI(URI rawURI, Configuration conf) throws IOException {
186 String tmpAuth = rawURI.getAuthority();
    // no authority was given, so fall back to the
    // default filesystem from the configuration
    // and return its uri
191 if (tmpAuth == null) {
192 //create a path
193 return FileSystem.getDefaultUri(conf);
194 }
195 String host = rawURI.getHost();
196 if (host == null) {
197 throw new IOException("URI: " + rawURI
198 + " is an invalid Har URI since host==null."
199 + " Expecting har://<scheme>-<host>/<path>.");
200 }
201 int i = host.indexOf('-');
202 if (i < 0) {
203 throw new IOException("URI: " + rawURI
204 + " is an invalid Har URI since '-' not found."
205 + " Expecting har://<scheme>-<host>/<path>.");
206 }
207 final String underLyingScheme = host.substring(0, i);
208 i++;
209 final String underLyingHost = i == host.length()? null: host.substring(i);
210 int underLyingPort = rawURI.getPort();
211 String auth = (underLyingHost == null && underLyingPort == -1)?
212 null:(underLyingHost+":"+underLyingPort);
213 URI tmp = null;
214 if (rawURI.getQuery() != null) {
215 // query component not allowed
216 throw new IOException("query component in Path not supported " + rawURI);
217 }
218 try {
219 tmp = new URI(underLyingScheme, auth, rawURI.getPath(),
220 rawURI.getQuery(), rawURI.getFragment());
221 } catch (URISyntaxException e) {
      // should not happen: the components come from an already parsed URI
223 }
224 return tmp;
225 }
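  /*
   * For illustration (authority and paths are invented): decodeHarURI maps
   *   har://hdfs-namenode:8020/user/alice/logs.har/2012/01
   * to the underlying URI
   *   hdfs://namenode:8020/user/alice/logs.har/2012/01
   * while an authority-less har:///user/alice/logs.har/... falls back to the
   * default filesystem configured for the cluster.
   */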
226
227 private static String decodeString(String str)
228 throws UnsupportedEncodingException {
229 return URLDecoder.decode(str, "UTF-8");
230 }
231
232 private String decodeFileName(String fname)
233 throws UnsupportedEncodingException {
234 int version = metadata.getVersion();
235 if (version == 2 || version == 3){
236 return decodeString(fname);
237 }
238 return fname;
239 }
240
  /**
   * return the top level archive path as the working directory.
   */
244 public Path getWorkingDirectory() {
245 return new Path(uri.toString());
246 }
247
  /**
   * Create a har specific authority of the form
   * underlyingscheme-host:port
   * @param underLyingUri the URI of the underlying
   * filesystem
   * @return har specific auth
   */
255 private String getHarAuth(URI underLyingUri) {
256 String auth = underLyingUri.getScheme() + "-";
257 if (underLyingUri.getHost() != null) {
258 auth += underLyingUri.getHost() + ":";
259 if (underLyingUri.getPort() != -1) {
260 auth += underLyingUri.getPort();
261 }
262 }
263 else {
264 auth += ":";
265 }
266 return auth;
267 }
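  /*
   * For illustration (host and port are invented): an underlying URI of
   * hdfs://namenode:8020 yields the authority "hdfs-namenode:8020", while a
   * host-less URI such as file:/// yields "file-:".
   */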
268
  /**
   * Returns the uri of this filesystem.
   * The uri is of the form
   * har://underlyingfsscheme-host:port/pathintheunderlyingfs
   */
274 @Override
275 public URI getUri() {
276 return this.uri;
277 }
278
  /**
   * Returns the path inside the har filesystem, i.e. the path
   * relative to the root of the archive.
   * @param path the fully qualified path in the har filesystem.
   * @return the relative path inside the har filesystem, or null if
   * the path does not lie under the archive.
   */
287 private Path getPathInHar(Path path) {
288 Path harPath = new Path(path.toUri().getPath());
289 if (archivePath.compareTo(harPath) == 0)
290 return new Path(Path.SEPARATOR);
291 Path tmp = new Path(harPath.getName());
292 Path parent = harPath.getParent();
293 while (!(parent.compareTo(archivePath) == 0)) {
294 if (parent.toString().equals(Path.SEPARATOR)) {
295 tmp = null;
296 break;
297 }
298 tmp = new Path(parent.getName(), tmp);
299 parent = parent.getParent();
300 }
301 if (tmp != null)
302 tmp = new Path(Path.SEPARATOR, tmp);
303 return tmp;
304 }
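  /*
   * For example (paths are illustrative): with archivePath = /user/alice/logs.har,
   * getPathInHar(har://hdfs-nn:8020/user/alice/logs.har/2012/01) returns /2012/01,
   * and passing the archive root itself returns "/".
   */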
305
  // Resolve the in-archive path p (which starts with '/') against the
  // archive path given by initial, yielding a fully qualified har path.
  // Parsing and string manipulation would be error prone, so
  // just use the Path API to do it.
310 private Path makeRelative(String initial, Path p) {
311 String scheme = this.uri.getScheme();
312 String authority = this.uri.getAuthority();
313 Path root = new Path(Path.SEPARATOR);
314 if (root.compareTo(p) == 0)
315 return new Path(scheme, authority, initial);
316 Path retPath = new Path(p.getName());
317 Path parent = p.getParent();
318 for (int i=0; i < p.depth()-1; i++) {
319 retPath = new Path(parent.getName(), retPath);
320 parent = parent.getParent();
321 }
322 return new Path(new Path(scheme, authority, initial),
323 retPath.toString());
324 }
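  /*
   * Continuing the illustrative example above:
   * makeRelative("/user/alice/logs.har", new Path("/2012/01")) resolves to
   * har://hdfs-nn:8020/user/alice/logs.har/2012/01, i.e. the inverse of
   * getPathInHar for paths under the archive.
   */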
325
326 /* this makes a path qualified in the har filesystem
327 * (non-Javadoc)
328 * @see org.apache.hadoop.fs.FilterFileSystem#makeQualified(
329 * org.apache.hadoop.fs.Path)
330 */
331 @Override
332 public Path makeQualified(Path path) {
333 // make sure that we just get the
334 // path component
335 Path fsPath = path;
336 if (!path.isAbsolute()) {
337 fsPath = new Path(archivePath, path);
338 }
339
340 URI tmpURI = fsPath.toUri();
341 //change this to Har uri
342 return new Path(uri.getScheme(), harAuth, tmpURI.getPath());
343 }
344
345 /**
346 * Fix offset and length of block locations.
347 * Note that this method modifies the original array.
348 * @param locations block locations of har part file
349 * @param start the start of the desired range in the contained file
350 * @param len the length of the desired range
351 * @param fileOffsetInHar the offset of the desired file in the har part file
352 * @return block locations with fixed offset and length
353 */
354 static BlockLocation[] fixBlockLocations(BlockLocation[] locations,
355 long start,
356 long len,
357 long fileOffsetInHar) {
358 // offset 1 past last byte of desired range
359 long end = start + len;
360
361 for (BlockLocation location : locations) {
362 // offset of part block relative to beginning of desired file
363 // (may be negative if file starts in this part block)
364 long harBlockStart = location.getOffset() - fileOffsetInHar;
365 // offset 1 past last byte of har block relative to beginning of
366 // desired file
367 long harBlockEnd = harBlockStart + location.getLength();
368
369 if (start > harBlockStart) {
370 // desired range starts after beginning of this har block
371 // fix offset to beginning of relevant range (relative to desired file)
372 location.setOffset(start);
373 // fix length to relevant portion of har block
374 location.setLength(location.getLength() - (start - harBlockStart));
375 } else {
376 // desired range includes beginning of this har block
377 location.setOffset(harBlockStart);
378 }
379
380 if (harBlockEnd > end) {
381 // range ends before end of this har block
382 // fix length to remove irrelevant portion at the end
383 location.setLength(location.getLength() - (harBlockEnd - end));
384 }
385 }
386
387 return locations;
388 }
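  /*
   * A small worked example (all numbers are invented): suppose the contained
   * file starts at fileOffsetInHar = 1000 in the part file and the caller
   * asked for start = 0, len = 500. A part-file block at offset 900 with
   * length 300 covers bytes -100..200 of the contained file and is rewritten
   * to offset 0, length 200; a block at offset 1400 with length 300 covers
   * bytes 400..700 and is trimmed to offset 400, length 100.
   */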
389
390 /**
391 * Get block locations from the underlying fs and fix their
392 * offsets and lengths.
393 * @param file the input filestatus to get block locations
394 * @param start the start of the desired range in the contained file
395 * @param len the length of the desired range
396 * @return block locations for this segment of file
397 * @throws IOException
398 */
399 @Override
400 public BlockLocation[] getFileBlockLocations(FileStatus file, long start,
401 long len) throws IOException {
402 HarStatus hstatus = getFileHarStatus(file.getPath());
403 Path partPath = new Path(archivePath, hstatus.getPartName());
404 FileStatus partStatus = metadata.getPartFileStatus(partPath);
405
406 // get all part blocks that overlap with the desired file blocks
407 BlockLocation[] locations =
408 fs.getFileBlockLocations(partStatus,
409 hstatus.getStartIndex() + start, len);
410
411 return fixBlockLocations(locations, start, len, hstatus.getStartIndex());
412 }
413
  /**
   * the hash of the path p inside the har filesystem
   * @param p the path in the har filesystem
   * @return the hash code of the path.
   */
420 public static int getHarHash(Path p) {
421 return (p.toString().hashCode() & 0x7fffffff);
422 }
423
424 static class Store {
425 public Store() {
426 begin = end = startHash = endHash = 0;
427 }
428 public Store(long begin, long end, int startHash, int endHash) {
429 this.begin = begin;
430 this.end = end;
431 this.startHash = startHash;
432 this.endHash = endHash;
433 }
434 public long begin;
435 public long end;
436 public int startHash;
437 public int endHash;
438 }
439
  /**
   * Get filestatuses of all the children of a given directory. This just reads
   * through the index file and reads line by line to get all statuses for the
   * children of a directory. It is a brute force way of getting all such
   * filestatuses.
   *
   * @param parent
   *          the parent path directory
   * @param statuses
   *          the list to add the children filestatuses to
   * @param children
   *          the string list of children for this parent
   */
454 private void fileStatusesInIndex(HarStatus parent, List<FileStatus> statuses,
455 List<String> children) throws IOException {
456 String parentString = parent.getName();
457 if (!parentString.endsWith(Path.SEPARATOR)){
458 parentString += Path.SEPARATOR;
459 }
460 Path harPath = new Path(parentString);
461 int harlen = harPath.depth();
462 final Map<String, FileStatus> cache = new TreeMap<String, FileStatus>();
463
464 for (HarStatus hstatus : metadata.archive.values()) {
465 String child = hstatus.getName();
466 if ((child.startsWith(parentString))) {
467 Path thisPath = new Path(child);
468 if (thisPath.depth() == harlen + 1) {
469 statuses.add(toFileStatus(hstatus, cache));
470 }
471 }
472 }
473 }
474
475 /**
476 * Combine the status stored in the index and the underlying status.
477 * @param h status stored in the index
478 * @param cache caching the underlying file statuses
479 * @return the combined file status
480 * @throws IOException
481 */
482 private FileStatus toFileStatus(HarStatus h,
483 Map<String, FileStatus> cache) throws IOException {
484 FileStatus underlying = null;
485 if (cache != null) {
486 underlying = cache.get(h.partName);
487 }
488 if (underlying == null) {
489 final Path p = h.isDir? archivePath: new Path(archivePath, h.partName);
490 underlying = fs.getFileStatus(p);
491 if (cache != null) {
492 cache.put(h.partName, underlying);
493 }
494 }
495
496 long modTime = 0;
497 int version = metadata.getVersion();
498 if (version < 3) {
499 modTime = underlying.getModificationTime();
500 } else if (version == 3) {
501 modTime = h.getModificationTime();
502 }
503
504 return new FileStatus(
505 h.isDir()? 0L: h.getLength(),
506 h.isDir(),
507 underlying.getReplication(),
508 underlying.getBlockSize(),
509 modTime,
510 underlying.getAccessTime(),
511 underlying.getPermission(),
512 underlying.getOwner(),
513 underlying.getGroup(),
514 makeRelative(this.uri.getPath(), new Path(h.name)));
515 }
516
  // a single line parser for hadoop archives status
  // stored in a single line in the index files
  // the format is of the form
  // filename "dir"/"file" partFileName startIndex length
  // <space separated children>
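  // For illustration only (names and sizes are invented), a file entry and a
  // directory entry might look like:
  //   /2012/01/part-00000 file part-0 0 1048576
  //   /2012/01 dir none 0 0 part-00000 part-00001
  // Version 3 additionally stores an encoded properties field (modification
  // time, permission, owner, group) as described in the constructor below.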
522 private class HarStatus {
523 boolean isDir;
524 String name;
525 List<String> children;
526 String partName;
527 long startIndex;
528 long length;
529 long modificationTime = 0;
530
531 public HarStatus(String harString) throws UnsupportedEncodingException {
532 String[] splits = harString.split(" ");
533 this.name = decodeFileName(splits[0]);
      this.isDir = "dir".equals(splits[1]);
      // partName is equal to "none" if this is a directory
536 this.partName = splits[2];
537 this.startIndex = Long.parseLong(splits[3]);
538 this.length = Long.parseLong(splits[4]);
539
540 int version = metadata.getVersion();
541 String[] propSplits = null;
542 // propSplits is used to retrieve the metainformation that Har versions
543 // 1 & 2 missed (modification time, permission, owner group).
544 // These fields are stored in an encoded string placed in different
545 // locations depending on whether it's a file or directory entry.
546 // If it's a directory, the string will be placed at the partName
547 // location (directories have no partName because they don't have data
548 // to be stored). This is done because the number of fields in a
549 // directory entry is unbounded (all children are listed at the end)
550 // If it's a file, the string will be the last field.
551 if (isDir) {
552 if (version == 3){
553 propSplits = decodeString(this.partName).split(" ");
554 }
555 children = new ArrayList<String>();
556 for (int i = 5; i < splits.length; i++) {
557 children.add(decodeFileName(splits[i]));
558 }
559 } else if (version == 3) {
560 propSplits = decodeString(splits[5]).split(" ");
561 }
562
563 if (propSplits != null && propSplits.length >= 4) {
564 modificationTime = Long.parseLong(propSplits[0]);
565 // the fields below are stored in the file but are currently not used
566 // by HarFileSystem
567 // permission = new FsPermission(Short.parseShort(propSplits[1]));
568 // owner = decodeString(propSplits[2]);
569 // group = decodeString(propSplits[3]);
570 }
571 }
572 public boolean isDir() {
573 return isDir;
574 }
575
576 public String getName() {
577 return name;
578 }
579 public String getPartName() {
580 return partName;
581 }
582 public long getStartIndex() {
583 return startIndex;
584 }
585 public long getLength() {
586 return length;
587 }
588 public long getModificationTime() {
589 return modificationTime;
590 }
591 }
592
  /**
   * return the filestatus of files in the har archive.
   * The permissions returned are those of the archive
   * index files; permissions are not persisted
   * while creating a hadoop archive.
   * @param f the path in har filesystem
   * @return filestatus.
   * @throws IOException
   */
602 @Override
603 public FileStatus getFileStatus(Path f) throws IOException {
604 HarStatus hstatus = getFileHarStatus(f);
605 return toFileStatus(hstatus, null);
606 }
607
608 private HarStatus getFileHarStatus(Path f) throws IOException {
609 // get the fs DataInputStream for the underlying file
610 // look up the index.
611 Path p = makeQualified(f);
612 Path harPath = getPathInHar(p);
613 if (harPath == null) {
614 throw new IOException("Invalid file name: " + f + " in " + uri);
615 }
616 HarStatus hstatus = metadata.archive.get(harPath);
617 if (hstatus == null) {
618 throw new FileNotFoundException("File: " + f + " does not exist in " + uri);
619 }
620 return hstatus;
621 }
622
623 /**
624 * @return null since no checksum algorithm is implemented.
625 */
626 public FileChecksum getFileChecksum(Path f) {
627 return null;
628 }
629
630 /**
631 * Returns a har input stream which fakes end of
632 * file. It reads the index files to get the part
633 * file name and the size and start of the file.
634 */
635 @Override
636 public FSDataInputStream open(Path f, int bufferSize) throws IOException {
637 // get the fs DataInputStream for the underlying file
638 HarStatus hstatus = getFileHarStatus(f);
639 // we got it.. woo hooo!!!
640 if (hstatus.isDir()) {
641 throw new FileNotFoundException(f + " : not a file in " +
642 archivePath);
643 }
644 return new HarFSDataInputStream(fs, new Path(archivePath,
645 hstatus.getPartName()),
646 hstatus.getStartIndex(), hstatus.getLength(), bufferSize);
647 }
648
649 public FSDataOutputStream create(Path f,
650 FsPermission permission,
651 boolean overwrite,
652 int bufferSize,
653 short replication,
654 long blockSize,
655 Progressable progress) throws IOException {
656 throw new IOException("Har: create not allowed.");
657 }
658
659 @Override
660 public void close() throws IOException {
661 if (fs != null) {
662 try {
663 fs.close();
664 } catch(IOException ie) {
665 //this might already be closed
666 // ignore
667 }
668 }
669 }
670
671 /**
672 * Not implemented.
673 */
674 @Override
675 public boolean setReplication(Path src, short replication) throws IOException{
676 throw new IOException("Har: setreplication not allowed");
677 }
678
679 /**
680 * Not implemented.
681 */
682 @Override
683 public boolean delete(Path f, boolean recursive) throws IOException {
684 throw new IOException("Har: delete not allowed");
685 }
686
687 /**
688 * liststatus returns the children of a directory
689 * after looking up the index files.
690 */
691 @Override
692 public FileStatus[] listStatus(Path f) throws IOException {
    // need to see if the path is present in the index
    // we will create fake filestatuses to return
    // to the client
697 List<FileStatus> statuses = new ArrayList<FileStatus>();
698 Path tmpPath = makeQualified(f);
699 Path harPath = getPathInHar(tmpPath);
700 HarStatus hstatus = metadata.archive.get(harPath);
701 if (hstatus == null) {
702 throw new FileNotFoundException("File " + f + " not found in " + archivePath);
703 }
704 if (hstatus.isDir()) {
705 fileStatusesInIndex(hstatus, statuses, hstatus.children);
706 } else {
707 statuses.add(toFileStatus(hstatus, null));
708 }
709
710 return statuses.toArray(new FileStatus[statuses.size()]);
711 }
712
713 /**
714 * return the top level archive path.
715 */
716 public Path getHomeDirectory() {
717 return new Path(uri.toString());
718 }
719
720 public void setWorkingDirectory(Path newDir) {
721 //does nothing.
722 }
723
724 /**
725 * not implemented.
726 */
727 public boolean mkdirs(Path f, FsPermission permission) throws IOException {
728 throw new IOException("Har: mkdirs not allowed");
729 }
730
731 /**
732 * not implemented.
733 */
734 public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws
735 IOException {
736 throw new IOException("Har: copyfromlocalfile not allowed");
737 }
738
739 /**
740 * copies the file in the har filesystem to a local file.
741 */
742 public void copyToLocalFile(boolean delSrc, Path src, Path dst)
743 throws IOException {
744 FileUtil.copy(this, src, getLocal(getConf()), dst, false, getConf());
745 }
746
747 /**
748 * not implemented.
749 */
750 public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile)
751 throws IOException {
752 throw new IOException("Har: startLocalOutput not allowed");
753 }
754
755 /**
756 * not implemented.
757 */
758 public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile)
759 throws IOException {
760 throw new IOException("Har: completeLocalOutput not allowed");
761 }
762
763 /**
764 * not implemented.
765 */
766 public void setOwner(Path p, String username, String groupname)
767 throws IOException {
768 throw new IOException("Har: setowner not allowed");
769 }
770
771 /**
772 * Not implemented.
773 */
  public void setPermission(Path p, FsPermission permission)
775 throws IOException {
776 throw new IOException("Har: setPermission not allowed");
777 }
778
779 /**
780 * Hadoop archives input stream. This input stream fakes EOF
781 * since archive files are part of bigger part files.
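   *
   * <p>For example (numbers are invented): for a file stored at startIndex
   * 5000 with length 2000 inside part-0, seek(0) positions the underlying
   * part-file stream at offset 5000, getPos() reports positions relative to
   * that start, and reads never return data at or past part-file offset 7000.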
782 */
783 private static class HarFSDataInputStream extends FSDataInputStream {
784 /**
785 * Create an input stream that fakes all the reads/positions/seeking.
786 */
787 private static class HarFsInputStream extends FSInputStream {
788 private long position, start, end;
789 //The underlying data input stream that the
790 // underlying filesystem will return.
791 private FSDataInputStream underLyingStream;
792 //one byte buffer
793 private byte[] oneBytebuff = new byte[1];
794 HarFsInputStream(FileSystem fs, Path path, long start,
795 long length, int bufferSize) throws IOException {
796 underLyingStream = fs.open(path, bufferSize);
797 underLyingStream.seek(start);
798 // the start of this file in the part file
799 this.start = start;
800 // the position pointer in the part file
801 this.position = start;
802 // the end pointer in the part file
803 this.end = start + length;
804 }
805
806 public synchronized int available() throws IOException {
807 long remaining = end - underLyingStream.getPos();
808 if (remaining > (long)Integer.MAX_VALUE) {
809 return Integer.MAX_VALUE;
810 }
811 return (int) remaining;
812 }
813
814 public synchronized void close() throws IOException {
815 underLyingStream.close();
816 super.close();
817 }
818
819 //not implemented
820 @Override
821 public void mark(int readLimit) {
822 // do nothing
823 }
824
825 /**
826 * reset is not implemented
827 */
828 public void reset() throws IOException {
829 throw new IOException("reset not implemented.");
830 }
831
832 public synchronized int read() throws IOException {
833 int ret = read(oneBytebuff, 0, 1);
834 return (ret <= 0) ? -1: (oneBytebuff[0] & 0xff);
835 }
836
      public synchronized int read(byte[] b) throws IOException {
        // position is advanced by the three-argument read() below;
        // do not add it again here
        return read(b, 0, b.length);
      }
844
      /**
       * Read up to len bytes, never reading past the logical end of
       * this file's region within the part file.
       */
      public synchronized int read(byte[] b, int offset, int len)
        throws IOException {
        int newlen = len;
        int ret = -1;
        if (position + len > end) {
          newlen = (int) (end - position);
        }
        // end case
        if (newlen == 0)
          return ret;
        ret = underLyingStream.read(b, offset, newlen);
        if (ret > 0) {
          position += ret;
        }
        return ret;
      }
862
863 public synchronized long skip(long n) throws IOException {
864 long tmpN = n;
865 if (tmpN > 0) {
866 if (position + tmpN > end) {
867 tmpN = end - position;
868 }
869 underLyingStream.seek(tmpN + position);
870 position += tmpN;
871 return tmpN;
872 }
873 return (tmpN < 0)? -1 : 0;
874 }
875
876 public synchronized long getPos() throws IOException {
877 return (position - start);
878 }
879
880 public synchronized void seek(long pos) throws IOException {
881 if (pos < 0 || (start + pos > end)) {
882 throw new IOException("Failed to seek: EOF");
883 }
884 position = start + pos;
885 underLyingStream.seek(position);
886 }
887
888 public boolean seekToNewSource(long targetPos) throws IOException {
889 //do not need to implement this
890 // hdfs in itself does seektonewsource
891 // while reading.
892 return false;
893 }
894
895 /**
896 * implementing position readable.
897 */
898 public int read(long pos, byte[] b, int offset, int length)
899 throws IOException {
900 int nlength = length;
901 if (start + nlength + pos > end) {
902 nlength = (int) (end - (start + pos));
903 }
904 return underLyingStream.read(pos + start , b, offset, nlength);
905 }
906
907 /**
908 * position readable again.
909 */
910 public void readFully(long pos, byte[] b, int offset, int length)
911 throws IOException {
912 if (start + length + pos > end) {
913 throw new IOException("Not enough bytes to read.");
914 }
915 underLyingStream.readFully(pos + start, b, offset, length);
916 }
917
918 public void readFully(long pos, byte[] b) throws IOException {
919 readFully(pos, b, 0, b.length);
920 }
921
922 }
923
924 /**
925 * constructors for har input stream.
926 * @param fs the underlying filesystem
927 * @param p The path in the underlying filesystem
928 * @param start the start position in the part file
929 * @param length the length of valid data in the part file
930 * @param bufsize the buffer size
931 * @throws IOException
932 */
933 public HarFSDataInputStream(FileSystem fs, Path p, long start,
934 long length, int bufsize) throws IOException {
935 super(new HarFsInputStream(fs, p, start, length, bufsize));
936 }
937
938 /**
939 * constructor for har input stream.
940 * @param fs the underlying filesystem
941 * @param p the path in the underlying file system
942 * @param start the start position in the part file
943 * @param length the length of valid data in the part file.
944 * @throws IOException
945 */
946 public HarFSDataInputStream(FileSystem fs, Path p, long start, long length)
947 throws IOException {
948 super(new HarFsInputStream(fs, p, start, length, 0));
949 }
950 }
951
952 private class HarMetaData {
953 private FileSystem fs;
954 private int version;
955 // the masterIndex of the archive
956 private Path masterIndexPath;
957 // the index file
958 private Path archiveIndexPath;
959
960 private long masterIndexTimestamp;
961 private long archiveIndexTimestamp;
962
963 List<Store> stores = new ArrayList<Store>();
964 Map<Path, HarStatus> archive = new HashMap<Path, HarStatus>();
965 private Map<Path, FileStatus> partFileStatuses = new HashMap<Path, FileStatus>();
966
967 public HarMetaData(FileSystem fs, Path masterIndexPath, Path archiveIndexPath) {
968 this.fs = fs;
969 this.masterIndexPath = masterIndexPath;
970 this.archiveIndexPath = archiveIndexPath;
971 }
972
973 public FileStatus getPartFileStatus(Path partPath) throws IOException {
974 FileStatus status;
975 status = partFileStatuses.get(partPath);
976 if (status == null) {
977 status = fs.getFileStatus(partPath);
978 partFileStatuses.put(partPath, status);
979 }
980 return status;
981 }
982
983 public long getMasterIndexTimestamp() {
984 return masterIndexTimestamp;
985 }
986
987 public long getArchiveIndexTimestamp() {
988 return archiveIndexTimestamp;
989 }
990
991 private int getVersion() {
992 return version;
993 }
994
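    /*
     * For illustration only (hashes and offsets are invented), a _masterindex
     * might contain:
     *   3
     *   0 536870911 0 1432
     *   536870912 2147483647 1432 2890
     * The first line is the archive version; each following line is
     * "startHash endHash begin end", where begin and end are byte offsets of
     * that hash range's entries inside the _index file.
     */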
995 private void parseMetaData() throws IOException {
996 Text line;
997 long read;
998 FSDataInputStream in = null;
999 LineReader lin = null;
1000
1001 try {
1002 in = fs.open(masterIndexPath);
1003 FileStatus masterStat = fs.getFileStatus(masterIndexPath);
1004 masterIndexTimestamp = masterStat.getModificationTime();
1005 lin = new LineReader(in, getConf());
1006 line = new Text();
1007 read = lin.readLine(line);
1008
1009 // the first line contains the version of the index file
1010 String versionLine = line.toString();
1011 String[] arr = versionLine.split(" ");
1012 version = Integer.parseInt(arr[0]);
1013 // make it always backwards-compatible
1014 if (this.version > HarFileSystem.VERSION) {
1015 throw new IOException("Invalid version " +
1016 this.version + " expected " + HarFileSystem.VERSION);
1017 }
1018
1019 // each line contains a hashcode range and the index file name
1020 String[] readStr = null;
1021 while(read < masterStat.getLen()) {
1022 int b = lin.readLine(line);
1023 read += b;
1024 readStr = line.toString().split(" ");
1025 int startHash = Integer.parseInt(readStr[0]);
1026 int endHash = Integer.parseInt(readStr[1]);
1027 stores.add(new Store(Long.parseLong(readStr[2]),
1028 Long.parseLong(readStr[3]), startHash,
1029 endHash));
1030 line.clear();
1031 }
1032 } finally {
1033 IOUtils.cleanup(LOG, lin, in);
1034 }
1035
1036 FSDataInputStream aIn = fs.open(archiveIndexPath);
1037 try {
1038 FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
1039 archiveIndexTimestamp = archiveStat.getModificationTime();
1040 LineReader aLin;
1041
1042 // now start reading the real index file
1043 for (Store s: stores) {
1044 read = 0;
1045 aIn.seek(s.begin);
1046 aLin = new LineReader(aIn, getConf());
1047 while (read + s.begin < s.end) {
1048 int tmp = aLin.readLine(line);
1049 read += tmp;
1050 String lineFeed = line.toString();
1051 String[] parsed = lineFeed.split(" ");
1052 parsed[0] = decodeFileName(parsed[0]);
1053 archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
1054 line.clear();
1055 }
1056 }
1057 } finally {
1058 IOUtils.cleanup(LOG, aIn);
1059 }
1060 }
1061 }
1062
1063 /*
1064 * testing purposes only:
1065 */
1066 HarMetaData getMetadata() {
1067 return metadata;
1068 }
1069 }