View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.digest;
19  
20  import java.io.ByteArrayOutputStream;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.nio.charset.StandardCharsets;
24  import java.nio.file.DirectoryStream;
25  import java.nio.file.Files;
26  import java.nio.file.Path;
27  import java.security.MessageDigest;
28  import java.util.HashMap;
29  import java.util.Map;
30  import java.util.Objects;
31  import java.util.Set;
32  import java.util.TreeSet;
33  import java.util.function.Supplier;
34  
35  /**
36   * Computes <a href="https://git-scm.com/">Git</a> object identifiers and their generalizations described by the
37   * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
38   *
39   * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git.
40   * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p>
41   *
42   * <p>This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.</p>
43   *
44   * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
45   * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
46   * @since 1.22.0
47   */
48  public class GitIdentifiers {
49  
50      /**
51       * Represents a single entry in a Git tree object.
52       *
53       * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
54       * <ul>
55       *   <li>a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
56       *   <li>the entry name (file or directory name, without a path separator),</li>
57       *   <li>the raw object id of the referenced blob or sub-tree.</li>
58       * </ul>
59       *
60       * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/}
61       * sorts after {@code foobar}.</p>
62       *
63       * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
64       * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
65       */
66      static class DirectoryEntry implements Comparable<DirectoryEntry> {
67  
68          /**
69           * The entry name (file or directory name, no path separator).
70           */
71          private final String name;
72  
73          /**
74           * The raw object id of the referenced blob or sub-tree.
75           */
76          private final byte[] rawObjectId;
77  
78          /**
79           * The key used for ordering entries within a tree object.
80           *
81           * <p>>Git appends {@code '/'} to directory names before comparing.</p>
82           */
83          private final String sortKey;
84  
85          /**
86           * The Git object type, which determines the Unix file-mode prefix.
87           */
88          private final FileMode type;
89  
90          /**
91           * Constructs a new entry.
92           *
93           * @param name The name of the entry, not containing {@code '/'}.
94           * @param type The type of the entry, not null.
95           * @param rawObjectId The id of the entry, not null.
96           */
97          DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) {
98              if (Objects.requireNonNull(name).indexOf('/') >= 0) {
99                  throw new IllegalArgumentException("Entry name must not contain '/': " + name);
100             }
101             this.name = name;
102             this.type = Objects.requireNonNull(type);
103             this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
104             this.rawObjectId = Objects.requireNonNull(rawObjectId);
105         }
106 
107         @Override
108         public int compareTo(final DirectoryEntry o) {
109             return sortKey.compareTo(o.sortKey);
110         }
111 
112         @Override
113         public boolean equals(final Object obj) {
114             if (obj == this) {
115                 return true;
116             }
117             if (!(obj instanceof DirectoryEntry)) {
118                 return false;
119             }
120             final DirectoryEntry other = (DirectoryEntry) obj;
121             return name.equals(other.name);
122         }
123 
124         @Override
125         public int hashCode() {
126             return name.hashCode();
127         }
128 
129     }
130 
131     /**
132      * The type of a Git tree entry, which maps to a Unix file-mode string.
133      *
134      * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
135      * cover the four entry types that Git itself produces.</p>
136      *
137      * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
138      */
139     public enum FileMode {
140 
141         /**
142          * A subdirectory. Subdirectories can only be specified by SHA or through a tree mark set with {@code --import-marks}.
143          *
144          * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
145          */
146         DIRECTORY(new byte[] { '4', '0', '0', '0', '0' }),
147 
148         /**
149          * A regular, but executable, file.
150          */
151         EXECUTABLE(new byte[] { '1', '0', '0', '7', '5', '5' }),
152 
153         /**
154          * A gitlink, SHA-1 of the object refers to a commit in another repository. Git links can only be specified either by SHA or through a commit mark. They
155          * are used to implement submodules.
156          *
157          * @see <a href="https://git-scm.com/docs/gitdatamodel">gitdatamodel - Git&#39;s core data model</a>
158          * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
159          */
160         GIT_LINK(new byte[] { '1', '6', '0', '0', '0', '0' }),
161 
162         /**
163          * A regular (non-executable) file.
164          * <p>
165          * The majority of files in most projects use this mode. If in doubt, this is what you want.
166          * </p>
167          */
168         REGULAR(new byte[] { '1', '0', '0', '6', '4', '4' }),
169 
170         /**
171          * A symbolic link. The content of the file will be the link target.
172          */
173         SYMBOLIC_LINK(new byte[] { '1', '2', '0', '0', '0', '0' });
174 
175         private static FileMode get(final Path path) {
176             // Symbolic links first
177             if (Files.isSymbolicLink(path)) {
178                 return SYMBOLIC_LINK;
179             }
180             if (Files.isDirectory(path)) {
181                 return DIRECTORY;
182             }
183             if (Files.isExecutable(path)) {
184                 return EXECUTABLE;
185             }
186             return REGULAR;
187         }
188 
189         /**
190          * Serialized {@code mode}: since this is mutable, it must remain private.
191          */
192         private final byte[] modeBytes;
193 
194         FileMode(final byte[] modeBytes) {
195             // No need for a defensive copy since the array is private and never exposed,
196             this.modeBytes = modeBytes;
197         }
198     }
199 
200     /**
201      * Builds a Git tree identifier for a virtual directory structure, such as the contents of
202      * an archive.
203      */
204     public static final class TreeIdBuilder implements Supplier<byte[]> {
205 
206         /**
207          * Supplies a blob identifier that may throw {@link IOException}.
208          */
209         @FunctionalInterface
210         private interface BlobIdSupplier {
211             byte[] get() throws IOException;
212         }
213 
214         private static String requireNoParentTraversal(final String name) {
215             if ("..".equals(name)) {
216                 throw new IllegalArgumentException("Path component not allowed: " + name);
217             }
218             return name;
219         }
220 
221         private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
222         private final Map<String, DirectoryEntry> fileEntries = new HashMap<>();
223         private final MessageDigest messageDigest;
224 
225         private TreeIdBuilder(final MessageDigest messageDigest) {
226             this.messageDigest = Objects.requireNonNull(messageDigest);
227         }
228 
229         /**
230          * Adds and returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent.
231          *
232          * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}).
233          * @return The {@link TreeIdBuilder} for the subdirectory.
234          * @throws IllegalArgumentException If any path component is {@code ".."}.
235          */
236         public TreeIdBuilder addDirectory(final String name) {
237             TreeIdBuilder current = this;
238             for (final String component : name.split("/", -1)) {
239                 // Noop segments
240                 if (component.isEmpty() || ".".equals(component)) {
241                     continue;
242                 }
243                 current = current.dirEntries.computeIfAbsent(requireNoParentTraversal(component), k -> new TreeIdBuilder(messageDigest));
244             }
245             return current;
246         }
247 
248         private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException {
249             final int slash = name.lastIndexOf('/');
250             if (slash < 0) {
251                 fileEntries.put(name, new DirectoryEntry(requireNoParentTraversal(name), mode, blobId.get()));
252             } else {
253                 addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId);
254             }
255         }
256 
257         /**
258          * Adds a file entry at the given path within this tree.
259          *
260          * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
261          *
262          * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
263          * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
264          * @param data The file content.
265          * @throws IOException If an I/O error occurs.
266          * @throws IllegalArgumentException If any path component is {@code ".."}.
267          */
268         public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException {
269             addFile(mode, name, () -> blobId(messageDigest, data));
270         }
271 
272         /**
273          * Adds a file entry at the given path within this tree, streaming content without buffering.
274          *
275          * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
276          *
277          * <p>The stream is eagerly drained.</p>
278          *
279          * @param mode     The file mode (e.g. {@link FileMode#REGULAR}).
280          * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
281          * @param dataSize The exact number of bytes in {@code data}.
282          * @param data     The file content.
283          * @throws IOException If the stream cannot be read.
284          * @throws IllegalArgumentException If any path component is {@code ".."}.
285          */
286         public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException {
287             addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
288         }
289 
290         /**
291          * Adds a symbolic link entry at the given path within this tree.
292          *
293          * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
294          *
295          * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
296          * @param target The target of the symbolic link.
297          * @throws IOException If an I/O error occurs.
298          * @throws IllegalArgumentException If any path component is {@code ".."}.
299          */
300         public void addSymbolicLink(final String name, final String target) throws IOException {
301             addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8));
302         }
303 
304         /**
305          * Computes the Git tree identifier for this directory and all its descendants.
306          *
307          * @return The raw tree identifier bytes.
308          */
309         @Override
310         public byte[] get() {
311             final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values());
312             dirEntries.forEach((k, v) -> entries.add(new DirectoryEntry(k, FileMode.DIRECTORY, v.get())));
313             final ByteArrayOutputStream baos = new ByteArrayOutputStream();
314             for (final DirectoryEntry entry : entries) {
315                 baos.write(entry.type.modeBytes, 0, entry.type.modeBytes.length);
316                 baos.write(' ');
317                 final byte[] bytes = entry.name.getBytes(StandardCharsets.UTF_8);
318                 baos.write(bytes, 0, bytes.length);
319                 baos.write('\0');
320                 baos.write(entry.rawObjectId, 0, entry.rawObjectId.length);
321             }
322             messageDigest.reset();
323             DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size()));
324             return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest();
325         }
326 
327         private TreeIdBuilder populate(final Path directory) throws IOException {
328             try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) {
329                 for (final Path path : files) {
330                     final String name = Objects.toString(path.getFileName());
331                     final FileMode mode = FileMode.get(path);
332                     if (mode == FileMode.DIRECTORY) {
333                         addDirectory(name).populate(path);
334                     } else {
335                         addFile(mode, name, () -> blobId(messageDigest, path));
336                     }
337                 }
338             }
339             return this;
340         }
341     }
342 
343     /**
344      * Reads through a byte array and returns a generalized Git blob identifier.
345      *
346      * <p>The identifier is computed in the way described by the
347      * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
348      * algorithm.</p>
349      *
350      * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
351      *
352      * @param messageDigest The MessageDigest to use (for example SHA-1).
353      * @param data          Data to digest.
354      * @return A generalized Git blob identifier.
355      */
356     public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) {
357         messageDigest.reset();
358         DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
359         return DigestUtils.digest(messageDigest, data);
360     }
361 
362     /**
363      * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering.
364      *
365      * <p>When the size of the content is known in advance, this overload streams {@code data} directly through
366      * the digest without buffering the full content in memory.</p>
367      *
368      * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
369      *
370      * @param messageDigest The MessageDigest to use (for example SHA-1).
371      * @param dataSize      The exact number of bytes in {@code data}.
372      * @param data          Stream to digest.
373      * @return A generalized Git blob identifier.
374      * @throws IOException On error reading the stream.
375      */
376     public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException {
377         messageDigest.reset();
378         DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize));
379         return DigestUtils.updateDigest(messageDigest, data).digest();
380     }
381 
382     /**
383      * Reads through a file and returns a generalized Git blob identifier.
384      *
385      * <p>The identifier is computed in the way described by the
386      * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
387      * algorithm.</p>
388      *
389      * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
390      *
391      * @param messageDigest The MessageDigest to use (for example SHA-1).
392      * @param data          Path to the file to digest.
393      * @return A generalized Git blob identifier.
394      * @throws IOException On error accessing the file.
395      */
396     public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException {
397         if (Files.isSymbolicLink(data)) {
398             final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8);
399             return blobId(messageDigest, linkTarget);
400         }
401         messageDigest.reset();
402         DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data)));
403         return DigestUtils.updateDigest(messageDigest, data).digest();
404     }
405 
406     private static byte[] getGitBlobPrefix(final long dataSize) {
407         return getGitPrefix("blob", dataSize);
408     }
409 
410     private static byte[] getGitPrefix(final String type, final long dataSize) {
411         return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
412     }
413 
414     private static byte[] getGitTreePrefix(final long dataSize) {
415         return getGitPrefix("tree", dataSize);
416     }
417 
418     /**
419      * Reads through a directory and returns a generalized Git tree identifier.
420      *
421      * <p>The identifier is computed in the way described by the
422      * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
423      * algorithm.</p>
424      *
425      * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
426      *
427      * @param messageDigest The MessageDigest to use (for example SHA-1).
428      * @param data          Path to the directory to digest.
429      * @return A generalized Git tree identifier.
430      * @throws IOException On error accessing the directory or its contents.
431      */
432     public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException {
433         return treeIdBuilder(messageDigest).populate(data).get();
434     }
435 
436     /**
437      * Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory
438      * structure, such as the contents of an archive.
439      *
440      * <p>The identifier is computed in the way described by the
441      * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
442      * algorithm.</p>
443      *
444      * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
445      *
446      * @param messageDigest The MessageDigest to use (for example SHA-1).
447      * @return A new {@link TreeIdBuilder}.
448      */
449     public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) {
450         return new TreeIdBuilder(messageDigest);
451     }
452 
453     private GitIdentifiers() {
454         // utility class
455     }
456 }