001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.hdfs.util;
019    
020    import java.io.BufferedReader;
021    import java.io.File;
022    import java.io.FileInputStream;
023    import java.io.IOException;
024    import java.io.InputStream;
025    import java.io.InputStreamReader;
026    import java.security.DigestInputStream;
027    import java.security.MessageDigest;
028    import java.util.regex.Matcher;
029    import java.util.regex.Pattern;
030    
031    import org.apache.commons.logging.Log;
032    import org.apache.commons.logging.LogFactory;
033    import org.apache.hadoop.io.IOUtils;
034    import org.apache.hadoop.io.MD5Hash;
035    import org.apache.hadoop.util.StringUtils;
036    
037    import com.google.common.base.Charsets;
038    
039    /**
040     * Static functions for dealing with files of the same format
041     * that the Unix "md5sum" utility writes.
042     */
043    public abstract class MD5FileUtils {
044      private static final Log LOG = LogFactory.getLog(
045          MD5FileUtils.class);
046    
047      public static final String MD5_SUFFIX = ".md5";
048      private static final Pattern LINE_REGEX =
049        Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
050      
051      /**
052       * Verify that the previously saved md5 for the given file matches
053       * expectedMd5.
054       * @throws IOException 
055       */
056      public static void verifySavedMD5(File dataFile, MD5Hash expectedMD5)
057          throws IOException {
058        MD5Hash storedHash = readStoredMd5ForFile(dataFile);
059        // Check the hash itself
060        if (!expectedMD5.equals(storedHash)) {
061          throw new IOException(
062              "File " + dataFile + " did not match stored MD5 checksum " +
063              " (stored: " + storedHash + ", computed: " + expectedMD5);
064        }
065      }
066      
067      /**
068       * Read the md5 checksum stored alongside the given file, or null
069       * if no md5 is stored.
070       * @param dataFile the file containing data
071       * @return the checksum stored in dataFile.md5
072       */
073      public static MD5Hash readStoredMd5ForFile(File dataFile) throws IOException {
074        File md5File = getDigestFileForFile(dataFile);
075    
076        String md5Line;
077        
078        if (!md5File.exists()) {
079          return null;
080        }
081        
082        BufferedReader reader =
083            new BufferedReader(new InputStreamReader(new FileInputStream(
084                md5File), Charsets.UTF_8));
085        try {
086          md5Line = reader.readLine();
087          if (md5Line == null) { md5Line = ""; }
088          md5Line = md5Line.trim();
089        } catch (IOException ioe) {
090          throw new IOException("Error reading md5 file at " + md5File, ioe);
091        } finally {
092          IOUtils.cleanup(LOG, reader);
093        }
094        
095        Matcher matcher = LINE_REGEX.matcher(md5Line);
096        if (!matcher.matches()) {
097          throw new IOException("Invalid MD5 file at " + md5File
098              + " (does not match expected pattern)");
099        }
100        String storedHash = matcher.group(1);
101        File referencedFile = new File(matcher.group(2));
102    
103        // Sanity check: Make sure that the file referenced in the .md5 file at
104        // least has the same name as the file we expect
105        if (!referencedFile.getName().equals(dataFile.getName())) {
106          throw new IOException(
107              "MD5 file at " + md5File + " references file named " +
108              referencedFile.getName() + " but we expected it to reference " +
109              dataFile);
110        }
111        return new MD5Hash(storedHash);
112      }
113      
114      /**
115       * Read dataFile and compute its MD5 checksum.
116       */
117      public static MD5Hash computeMd5ForFile(File dataFile) throws IOException {
118        InputStream in = new FileInputStream(dataFile);
119        try {
120          MessageDigest digester = MD5Hash.getDigester();
121          DigestInputStream dis = new DigestInputStream(in, digester);
122          IOUtils.copyBytes(dis, new IOUtils.NullOutputStream(), 128*1024);
123          
124          return new MD5Hash(digester.digest());
125        } finally {
126          IOUtils.closeStream(in);
127        }
128      }
129    
130      /**
131       * Save the ".md5" file that lists the md5sum of another file.
132       * @param dataFile the original file whose md5 was computed
133       * @param digest the computed digest
134       * @throws IOException
135       */
136      public static void saveMD5File(File dataFile, MD5Hash digest)
137          throws IOException {
138        File md5File = getDigestFileForFile(dataFile);
139        String digestString = StringUtils.byteToHexString(
140            digest.getDigest());
141        String md5Line = digestString + " *" + dataFile.getName() + "\n";
142        
143        AtomicFileOutputStream afos = new AtomicFileOutputStream(md5File);
144        afos.write(md5Line.getBytes(Charsets.UTF_8));
145        afos.close();
146        LOG.debug("Saved MD5 " + digest + " to " + md5File);
147      }
148    
149      /**
150       * @return a reference to the file with .md5 suffix that will
151       * contain the md5 checksum for the given data file.
152       */
153      public static File getDigestFileForFile(File file) {
154        return new File(file.getParentFile(), file.getName() + MD5_SUFFIX);
155      }
156    }