1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.codec.digest;
19
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.nio.charset.StandardCharsets;
24 import java.nio.file.DirectoryStream;
25 import java.nio.file.Files;
26 import java.nio.file.Path;
27 import java.security.MessageDigest;
28 import java.util.HashMap;
29 import java.util.Map;
30 import java.util.Objects;
31 import java.util.Set;
32 import java.util.TreeSet;
33 import java.util.function.Supplier;
34
35 /**
36 * Computes <a href="https://git-scm.com/">Git</a> object identifiers and their generalizations described by the
37 * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>.
38 *
39 * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git.
40 * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p>
41 *
42 * <p>This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.</p>
43 *
44 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
45 * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a>
46 * @since 1.22.0
47 */
48 public class GitIdentifiers {
49
50 /**
51 * Represents a single entry in a Git tree object.
52 *
53 * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
54 * <ul>
55 * <li>a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
56 * <li>the entry name (file or directory name, without a path separator),</li>
57 * <li>the raw object id of the referenced blob or sub-tree.</li>
58 * </ul>
59 *
60 * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/}
61 * sorts after {@code foobar}.</p>
62 *
63 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
64 * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
65 */
66 static class DirectoryEntry implements Comparable<DirectoryEntry> {
67
68 /**
69 * The entry name (file or directory name, no path separator).
70 */
71 private final String name;
72
73 /**
74 * The raw object id of the referenced blob or sub-tree.
75 */
76 private final byte[] rawObjectId;
77
78 /**
79 * The key used for ordering entries within a tree object.
80 *
81 * <p>>Git appends {@code '/'} to directory names before comparing.</p>
82 */
83 private final String sortKey;
84
85 /**
86 * The Git object type, which determines the Unix file-mode prefix.
87 */
88 private final FileMode type;
89
90 /**
91 * Constructs a new entry.
92 *
93 * @param name The name of the entry, not containing {@code '/'}.
94 * @param type The type of the entry, not null.
95 * @param rawObjectId The id of the entry, not null.
96 */
97 DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) {
98 if (Objects.requireNonNull(name).indexOf('/') >= 0) {
99 throw new IllegalArgumentException("Entry name must not contain '/': " + name);
100 }
101 this.name = name;
102 this.type = Objects.requireNonNull(type);
103 this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name;
104 this.rawObjectId = Objects.requireNonNull(rawObjectId);
105 }
106
107 @Override
108 public int compareTo(final DirectoryEntry o) {
109 return sortKey.compareTo(o.sortKey);
110 }
111
112 @Override
113 public boolean equals(final Object obj) {
114 if (obj == this) {
115 return true;
116 }
117 if (!(obj instanceof DirectoryEntry)) {
118 return false;
119 }
120 final DirectoryEntry other = (DirectoryEntry) obj;
121 return name.equals(other.name);
122 }
123
124 @Override
125 public int hashCode() {
126 return name.hashCode();
127 }
128
129 }
130
131 /**
132 * The type of a Git tree entry, which maps to a Unix file-mode string.
133 *
134 * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
135 * cover the four entry types that Git itself produces.</p>
136 *
137 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
138 */
139 public enum FileMode {
140
141 /**
142 * A subdirectory. Subdirectories can only be specified by SHA or through a tree mark set with {@code --import-marks}.
143 *
144 * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
145 */
146 DIRECTORY(new byte[] { '4', '0', '0', '0', '0' }),
147
148 /**
149 * A regular, but executable, file.
150 */
151 EXECUTABLE(new byte[] { '1', '0', '0', '7', '5', '5' }),
152
153 /**
154 * A gitlink, SHA-1 of the object refers to a commit in another repository. Git links can only be specified either by SHA or through a commit mark. They
155 * are used to implement submodules.
156 *
157 * @see <a href="https://git-scm.com/docs/gitdatamodel">gitdatamodel - Git's core data model</a>
158 * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a>
159 */
160 GIT_LINK(new byte[] { '1', '6', '0', '0', '0', '0' }),
161
162 /**
163 * A regular (non-executable) file.
164 * <p>
165 * The majority of files in most projects use this mode. If in doubt, this is what you want.
166 * </p>
167 */
168 REGULAR(new byte[] { '1', '0', '0', '6', '4', '4' }),
169
170 /**
171 * A symbolic link. The content of the file will be the link target.
172 */
173 SYMBOLIC_LINK(new byte[] { '1', '2', '0', '0', '0', '0' });
174
175 private static FileMode get(final Path path) {
176 // Symbolic links first
177 if (Files.isSymbolicLink(path)) {
178 return SYMBOLIC_LINK;
179 }
180 if (Files.isDirectory(path)) {
181 return DIRECTORY;
182 }
183 if (Files.isExecutable(path)) {
184 return EXECUTABLE;
185 }
186 return REGULAR;
187 }
188
189 /**
190 * Serialized {@code mode}: since this is mutable, it must remain private.
191 */
192 private final byte[] modeBytes;
193
194 FileMode(final byte[] modeBytes) {
195 // No need for a defensive copy since the array is private and never exposed,
196 this.modeBytes = modeBytes;
197 }
198 }
199
200 /**
201 * Builds a Git tree identifier for a virtual directory structure, such as the contents of
202 * an archive.
203 */
204 public static final class TreeIdBuilder implements Supplier<byte[]> {
205
206 /**
207 * Supplies a blob identifier that may throw {@link IOException}.
208 */
209 @FunctionalInterface
210 private interface BlobIdSupplier {
211 byte[] get() throws IOException;
212 }
213
214 private static String requireNoParentTraversal(final String name) {
215 if ("..".equals(name)) {
216 throw new IllegalArgumentException("Path component not allowed: " + name);
217 }
218 return name;
219 }
220
221 private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>();
222 private final Map<String, DirectoryEntry> fileEntries = new HashMap<>();
223 private final MessageDigest messageDigest;
224
225 private TreeIdBuilder(final MessageDigest messageDigest) {
226 this.messageDigest = Objects.requireNonNull(messageDigest);
227 }
228
229 /**
230 * Adds and returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent.
231 *
232 * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}).
233 * @return The {@link TreeIdBuilder} for the subdirectory.
234 * @throws IllegalArgumentException If any path component is {@code ".."}.
235 */
236 public TreeIdBuilder addDirectory(final String name) {
237 TreeIdBuilder current = this;
238 for (final String component : name.split("/", -1)) {
239 // Noop segments
240 if (component.isEmpty() || ".".equals(component)) {
241 continue;
242 }
243 current = current.dirEntries.computeIfAbsent(requireNoParentTraversal(component), k -> new TreeIdBuilder(messageDigest));
244 }
245 return current;
246 }
247
248 private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException {
249 final int slash = name.lastIndexOf('/');
250 if (slash < 0) {
251 fileEntries.put(name, new DirectoryEntry(requireNoParentTraversal(name), mode, blobId.get()));
252 } else {
253 addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId);
254 }
255 }
256
257 /**
258 * Adds a file entry at the given path within this tree.
259 *
260 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
261 *
262 * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
263 * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
264 * @param data The file content.
265 * @throws IOException If an I/O error occurs.
266 * @throws IllegalArgumentException If any path component is {@code ".."}.
267 */
268 public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException {
269 addFile(mode, name, () -> blobId(messageDigest, data));
270 }
271
272 /**
273 * Adds a file entry at the given path within this tree, streaming content without buffering.
274 *
275 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
276 *
277 * <p>The stream is eagerly drained.</p>
278 *
279 * @param mode The file mode (e.g. {@link FileMode#REGULAR}).
280 * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
281 * @param dataSize The exact number of bytes in {@code data}.
282 * @param data The file content.
283 * @throws IOException If the stream cannot be read.
284 * @throws IllegalArgumentException If any path component is {@code ".."}.
285 */
286 public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException {
287 addFile(mode, name, () -> blobId(messageDigest, dataSize, data));
288 }
289
290 /**
291 * Adds a symbolic link entry at the given path within this tree.
292 *
293 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p>
294 *
295 * @param name The relative path of the entry in normalized form(may contain {@code '/'}).
296 * @param target The target of the symbolic link.
297 * @throws IOException If an I/O error occurs.
298 * @throws IllegalArgumentException If any path component is {@code ".."}.
299 */
300 public void addSymbolicLink(final String name, final String target) throws IOException {
301 addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8));
302 }
303
304 /**
305 * Computes the Git tree identifier for this directory and all its descendants.
306 *
307 * @return The raw tree identifier bytes.
308 */
309 @Override
310 public byte[] get() {
311 final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values());
312 dirEntries.forEach((k, v) -> entries.add(new DirectoryEntry(k, FileMode.DIRECTORY, v.get())));
313 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
314 for (final DirectoryEntry entry : entries) {
315 baos.write(entry.type.modeBytes, 0, entry.type.modeBytes.length);
316 baos.write(' ');
317 final byte[] bytes = entry.name.getBytes(StandardCharsets.UTF_8);
318 baos.write(bytes, 0, bytes.length);
319 baos.write('\0');
320 baos.write(entry.rawObjectId, 0, entry.rawObjectId.length);
321 }
322 messageDigest.reset();
323 DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size()));
324 return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest();
325 }
326
327 private TreeIdBuilder populate(final Path directory) throws IOException {
328 try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) {
329 for (final Path path : files) {
330 final String name = Objects.toString(path.getFileName());
331 final FileMode mode = FileMode.get(path);
332 if (mode == FileMode.DIRECTORY) {
333 addDirectory(name).populate(path);
334 } else {
335 addFile(mode, name, () -> blobId(messageDigest, path));
336 }
337 }
338 }
339 return this;
340 }
341 }
342
343 /**
344 * Reads through a byte array and returns a generalized Git blob identifier.
345 *
346 * <p>The identifier is computed in the way described by the
347 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
348 * algorithm.</p>
349 *
350 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
351 *
352 * @param messageDigest The MessageDigest to use (for example SHA-1).
353 * @param data Data to digest.
354 * @return A generalized Git blob identifier.
355 */
356 public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) {
357 messageDigest.reset();
358 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length));
359 return DigestUtils.digest(messageDigest, data);
360 }
361
362 /**
363 * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering.
364 *
365 * <p>When the size of the content is known in advance, this overload streams {@code data} directly through
366 * the digest without buffering the full content in memory.</p>
367 *
368 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
369 *
370 * @param messageDigest The MessageDigest to use (for example SHA-1).
371 * @param dataSize The exact number of bytes in {@code data}.
372 * @param data Stream to digest.
373 * @return A generalized Git blob identifier.
374 * @throws IOException On error reading the stream.
375 */
376 public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException {
377 messageDigest.reset();
378 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize));
379 return DigestUtils.updateDigest(messageDigest, data).digest();
380 }
381
382 /**
383 * Reads through a file and returns a generalized Git blob identifier.
384 *
385 * <p>The identifier is computed in the way described by the
386 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
387 * algorithm.</p>
388 *
389 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
390 *
391 * @param messageDigest The MessageDigest to use (for example SHA-1).
392 * @param data Path to the file to digest.
393 * @return A generalized Git blob identifier.
394 * @throws IOException On error accessing the file.
395 */
396 public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException {
397 if (Files.isSymbolicLink(data)) {
398 final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8);
399 return blobId(messageDigest, linkTarget);
400 }
401 messageDigest.reset();
402 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data)));
403 return DigestUtils.updateDigest(messageDigest, data).digest();
404 }
405
406 private static byte[] getGitBlobPrefix(final long dataSize) {
407 return getGitPrefix("blob", dataSize);
408 }
409
410 private static byte[] getGitPrefix(final String type, final long dataSize) {
411 return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
412 }
413
414 private static byte[] getGitTreePrefix(final long dataSize) {
415 return getGitPrefix("tree", dataSize);
416 }
417
418 /**
419 * Reads through a directory and returns a generalized Git tree identifier.
420 *
421 * <p>The identifier is computed in the way described by the
422 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
423 * algorithm.</p>
424 *
425 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
426 *
427 * @param messageDigest The MessageDigest to use (for example SHA-1).
428 * @param data Path to the directory to digest.
429 * @return A generalized Git tree identifier.
430 * @throws IOException On error accessing the directory or its contents.
431 */
432 public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException {
433 return treeIdBuilder(messageDigest).populate(data).get();
434 }
435
436 /**
437 * Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory
438 * structure, such as the contents of an archive.
439 *
440 * <p>The identifier is computed in the way described by the
441 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
442 * algorithm.</p>
443 *
444 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
445 *
446 * @param messageDigest The MessageDigest to use (for example SHA-1).
447 * @return A new {@link TreeIdBuilder}.
448 */
449 public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) {
450 return new TreeIdBuilder(messageDigest);
451 }
452
453 private GitIdentifiers() {
454 // utility class
455 }
456 }