diff --git a/README.md b/README.md index 09fb2d8..e783e3c 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ A simple utility to create tarballs with a specific directory structure. - Cross-platform compatibility (FreeBSD, macOS, Linux) - Produces tarballs compatible with standard tar tools - Verified data integrity with MD5 hash comparison +- Automatically generates and verifies MD5 hashes of all files during extraction ## Building @@ -51,27 +52,47 @@ The tests will: The usage is the same for all binaries: ```bash +# Create a tarball ./bin/tarballer- -source /path/to/directory -output myarchive.tar.gz -prefix myprefix + +# Extract a tarball with integrity verification +./bin/tarballer- -extract -output myarchive.tar.gz -extractdir /path/to/extract ``` -### Options +### Create Mode Options - `-source`: The directory you want to compress (required) - `-output`: The name of the output tarball (defaults to "output.tar.gz") - `-prefix`: The directory name that will contain all files in the tarball (defaults to "myapp") -### Example +### Extract Mode Options + +- `-extract`: Enables extraction mode +- `-output`: The tarball to extract (required) +- `-extractdir`: Directory to extract to (defaults to current directory) +- `-verify`: Only verify hash integrity without extraction +- `-keepmanifest`: Keep the MD5 manifest file after extraction (defaults to removing it) + +### Examples ```bash -# On macOS: +# Create a tarball (on macOS): ./bin/tarballer-darwin -source ./myproject -output release.tar.gz -prefix app -# On FreeBSD: -./bin/tarballer-freebsd -source ./myproject -output release.tar.gz -prefix app +# Extract and verify (on Linux): +./bin/tarballer-linux -extract -output release.tar.gz -extractdir /path/to/extract + +# Only verify hash integrity without extraction: +./bin/tarballer-linux -extract -verify -output release.tar.gz ``` -When extracted, all files will be under the `app/` directory in the tarball and can be extracted with standard tools like: +## MD5 
Hash Verification -```bash -tar -xzf release.tar.gz -C /path/to/extract -``` \ No newline at end of file +Tarballer includes built-in file integrity protection: + +1. When creating a tarball, MD5 hashes are calculated for all files and stored in a hidden manifest file (`.md5-manifest.txt`) at the root of the extraction directory +2. During extraction, hashes are verified to ensure files haven't been corrupted or tampered with +3. The manifest file is automatically removed after extraction unless `-keepmanifest` is specified +4. If any file fails verification, the extraction is aborted with an error + +This provides an extra layer of security and data integrity validation compared to standard tar tools. \ No newline at end of file diff --git a/bin/tarballer-darwin b/bin/tarballer-darwin index 00d89e9..9c59428 100755 Binary files a/bin/tarballer-darwin and b/bin/tarballer-darwin differ diff --git a/bin/tarballer-freebsd b/bin/tarballer-freebsd index 1eca1bd..e961143 100755 Binary files a/bin/tarballer-freebsd and b/bin/tarballer-freebsd differ diff --git a/bin/tarballer-linux b/bin/tarballer-linux index a88966b..a127ecd 100755 Binary files a/bin/tarballer-linux and b/bin/tarballer-linux differ diff --git a/main.go b/main.go index 864cf87..c34cb1a 100644 --- a/main.go +++ b/main.go @@ -2,34 +2,84 @@ package main import ( "archive/tar" + "bufio" "compress/gzip" + "crypto/md5" + "encoding/hex" "flag" "fmt" "io" "os" "path/filepath" + "strings" ) +const manifestFilename = ".md5-manifest.txt" + func main() { // Define command line flags sourceDir := flag.String("source", "", "Source directory to compress") outputFile := flag.String("output", "output.tar.gz", "Output tarball filename") prefixDir := flag.String("prefix", "myapp", "Directory prefix in tarball") + extractMode := flag.Bool("extract", false, "Extract mode (instead of create)") + extractDir := flag.String("extractdir", "", "Directory to extract to (default: current directory)") + verifyOnly := 
// calcFileMD5 streams the file at filePath through an MD5 digest and returns
// the digest as a lowercase hexadecimal string. Any error from opening or
// reading the file is returned unchanged together with an empty string.
func calcFileMD5(filePath string) (string, error) {
	src, openErr := os.Open(filePath)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	digest := md5.New()
	_, copyErr := io.Copy(digest, src)
	if copyErr != nil {
		return "", copyErr
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum), nil
}
to store MD5 hashes + fileHashes := make(map[string]string) + // Resolve absolute source path to handle relative symlinks correctly sourceDir, err = filepath.Abs(sourceDir) if err != nil { @@ -66,6 +119,11 @@ func createTarball(sourceDir, outputFile, prefix string) error { return err } + // Skip the manifest file if it exists (from a previous run) + if relPath == manifestFilename { + return nil + } + // Create tar header using original file info header, err := tar.FileInfoHeader(info, "") if err != nil { @@ -88,6 +146,15 @@ func createTarball(sourceDir, outputFile, prefix string) error { // Make sure the link type is set correctly header.Typeflag = tar.TypeSymlink + + // For symlinks, we don't calculate MD5 hashes + } else if !info.IsDir() { + // Calculate MD5 hash for regular files + hash, err := calcFileMD5(filePath) + if err != nil { + return err + } + fileHashes[filepath.Join(prefix, relPath)] = hash } // Write header @@ -112,5 +179,391 @@ func createTarball(sourceDir, outputFile, prefix string) error { return nil }) - return err + if err != nil { + return err + } + + // Create and add the manifest file + var manifestContent strings.Builder + for path, hash := range fileHashes { + manifestContent.WriteString(fmt.Sprintf("%s %s\n", hash, path)) + } + + // Create a tar header for the manifest + manifestHeader := &tar.Header{ + Name: manifestFilename, + Mode: 0644, + Size: int64(manifestContent.Len()), + Typeflag: tar.TypeReg, + } + + // Write the manifest header + if err := tw.WriteHeader(manifestHeader); err != nil { + return err + } + + // Write the manifest content + if _, err := tw.Write([]byte(manifestContent.String())); err != nil { + return err + } + + return nil +} + +func extractTarball(tarballPath, extractDir string, verifyOnly, keepManifest bool) error { + // Open the tarball + file, err := os.Open(tarballPath) + if err != nil { + return err + } + defer file.Close() + + // Create gzip reader + gr, err := gzip.NewReader(file) + if err != nil { + return 
err + } + defer gr.Close() + + // Create tar reader + tr := tar.NewReader(gr) + + // Create a map to store expected MD5 hashes from the manifest + expectedHashes := make(map[string]string) + // First pass: Find and parse the manifest file + tempDir := "" + if !verifyOnly { + // Create a temporary directory for extraction + tempDir, err = os.MkdirTemp(extractDir, "tarballer-extract-") + if err != nil { + return err + } + } + + // Extract files to get the manifest + for { + header, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + if tempDir != "" { + os.RemoveAll(tempDir) + } + return err + } + + // Skip directories in first pass, we only want to find the manifest + if header.Typeflag == tar.TypeDir { + continue + } + + // Check if this is the manifest file + if filepath.Base(header.Name) == manifestFilename && filepath.Dir(header.Name) == "." { + // Read the manifest content + var content strings.Builder + if _, err := io.Copy(&content, tr); err != nil { + if tempDir != "" { + os.RemoveAll(tempDir) + } + return err + } + + // Parse the manifest to get expected hashes + scanner := bufio.NewScanner(strings.NewReader(content.String())) + for scanner.Scan() { + line := scanner.Text() + parts := strings.SplitN(line, " ", 2) + if len(parts) == 2 { + hash := parts[0] + path := parts[1] + expectedHashes[path] = hash + } + } + + if err := scanner.Err(); err != nil { + if tempDir != "" { + os.RemoveAll(tempDir) + } + return fmt.Errorf("error parsing manifest: %v", err) + } + + continue + } + + // If we're only verifying, skip extraction + if verifyOnly { + continue + } + + // Extract to temp dir to verify hashes + target := filepath.Join(tempDir, header.Name) + + // Create directory if needed + if header.Typeflag == tar.TypeDir { + if err := os.MkdirAll(target, 0755); err != nil { + os.RemoveAll(tempDir) + return err + } + continue + } + + // Create parent directory if it doesn't exist + if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil { + 
os.RemoveAll(tempDir) + return err + } + + // Handle symlinks + if header.Typeflag == tar.TypeSymlink { + if err := os.Symlink(header.Linkname, target); err != nil { + os.RemoveAll(tempDir) + return err + } + continue + } + + // Create regular file + f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY, os.FileMode(header.Mode)) + if err != nil { + os.RemoveAll(tempDir) + return err + } + + // Copy file content + if _, err := io.Copy(f, tr); err != nil { + f.Close() + os.RemoveAll(tempDir) + return err + } + f.Close() + } + + // If no manifest was found + if len(expectedHashes) == 0 { + if tempDir != "" { + os.RemoveAll(tempDir) + } + return fmt.Errorf("no MD5 manifest found in tarball") + } + + // If we're only verifying, we need to reopen the tarball + if verifyOnly { + file.Seek(0, 0) + gr, err = gzip.NewReader(file) + if err != nil { + return err + } + defer gr.Close() + tr = tar.NewReader(gr) + } + + // Second pass: Verify hashes + verificationFailed := false + fileVerified := make(map[string]bool) + + if verifyOnly { + // Extract to temp dir for verification + tempDir, err = os.MkdirTemp(extractDir, "tarballer-verify-") + if err != nil { + return err + } + defer os.RemoveAll(tempDir) + } + + for { + header, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + if tempDir != "" && !verifyOnly { + os.RemoveAll(tempDir) + } + return err + } + + // Skip directories and the manifest file for verification + if header.Typeflag == tar.TypeDir || (filepath.Base(header.Name) == manifestFilename && filepath.Dir(header.Name) == ".") { + continue + } + + // Skip symlinks for hash verification + if header.Typeflag == tar.TypeSymlink { + continue + } + + // Check if this file has an expected hash + expectedHash, exists := expectedHashes[header.Name] + if !exists { + fmt.Printf("Warning: File %s not found in manifest\n", header.Name) + continue + } + + // If verifyOnly, we need to extract the file to verify its hash + if verifyOnly { + target := 
filepath.Join(tempDir, header.Name) + + // Create parent directory if it doesn't exist + if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil { + return err + } + + // Create file + f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY, os.FileMode(header.Mode)) + if err != nil { + return err + } + + // Copy content + if _, err := io.Copy(f, tr); err != nil { + f.Close() + return err + } + f.Close() + + // Calculate hash + actualHash, err := calcFileMD5(target) + if err != nil { + return err + } + + // Verify hash + if actualHash != expectedHash { + fmt.Printf("Hash mismatch for %s: expected %s, got %s\n", header.Name, expectedHash, actualHash) + verificationFailed = true + } else { + fileVerified[header.Name] = true + } + } else { + // Calculate hash from extracted file + target := filepath.Join(tempDir, header.Name) + actualHash, err := calcFileMD5(target) + if err != nil { + os.RemoveAll(tempDir) + return err + } + + // Verify hash + if actualHash != expectedHash { + fmt.Printf("Hash mismatch for %s: expected %s, got %s\n", header.Name, expectedHash, actualHash) + verificationFailed = true + } else { + fileVerified[header.Name] = true + } + } + } + + // Check if all files in the manifest were verified + for path := range expectedHashes { + if !fileVerified[path] { + fmt.Printf("Warning: File %s in manifest was not found in tarball\n", path) + } + } + + // If verification failed or we're only verifying, we're done + if verificationFailed { + if !verifyOnly { + os.RemoveAll(tempDir) + } + return fmt.Errorf("hash verification failed for one or more files") + } + + if verifyOnly { + fmt.Println("All files verified successfully!") + return nil + } + + // Move the extracted files to the final destination (excluding manifest if needed) + files, err := os.ReadDir(tempDir) + if err != nil { + os.RemoveAll(tempDir) + return err + } + + // Create the final extract directory if it doesn't exist + if err := os.MkdirAll(extractDir, 0755); err != nil { + 
os.RemoveAll(tempDir) + return err + } + + // Move each top-level extracted item + for _, f := range files { + source := filepath.Join(tempDir, f.Name()) + dest := filepath.Join(extractDir, f.Name()) + + // Skip the manifest file if needed + if !keepManifest && (f.Name() == manifestFilename) { + continue + } + + // If destination already exists, remove it + if _, err := os.Stat(dest); err == nil { + if err := os.RemoveAll(dest); err != nil { + os.RemoveAll(tempDir) + return err + } + } + + // Move the file + if err := os.Rename(source, dest); err != nil { + // If rename fails (e.g., across devices), try copying + err = filepath.Walk(source, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + // Get relative path + relPath, err := filepath.Rel(source, path) + if err != nil { + return err + } + + targetPath := filepath.Join(dest, relPath) + + // Create directory + if info.IsDir() { + return os.MkdirAll(targetPath, info.Mode()) + } + + // Handle symlinks + if info.Mode()&os.ModeSymlink != 0 { + linkTarget, err := os.Readlink(path) + if err != nil { + return err + } + return os.Symlink(linkTarget, targetPath) + } + + // Copy file + srcFile, err := os.Open(path) + if err != nil { + return err + } + defer srcFile.Close() + + // Create destination file + dstFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_WRONLY, info.Mode()) + if err != nil { + return err + } + defer dstFile.Close() + + // Copy content + _, err = io.Copy(dstFile, srcFile) + return err + }) + + if err != nil { + os.RemoveAll(tempDir) + return err + } + } + } + + // Clean up temp directory + os.RemoveAll(tempDir) + + fmt.Println("Extraction completed and verified successfully!") + return nil } diff --git a/test/test.sh b/test/test.sh index af02b7c..672e7bf 100755 --- a/test/test.sh +++ b/test/test.sh @@ -90,10 +90,37 @@ run_basic_test() { # Verify hash comparison echo '=== COMPARING FILE HASHES ===' - ORIG_HASHES=$(cat /test/complex-original-md5.txt | awk 
'{print $1}' | sort) - EXTR_HASHES=$(cat /test/complex-extracted-md5.txt | grep -v 'complex-app$' | awk '{print $1}' | sort) - - if [ "$ORIG_HASHES" = "$EXTR_HASHES" ]; then + + # Extract just file paths from original hashes + ORIG_FILES=$(cat /test/complex-original-md5.txt | awk '{print $2}' | sort) + + # For each original file, check if its corresponding extracted file has the same hash + ALL_MATCH=1 + for SOURCE_FILE in $ORIG_FILES; do + # Get the base file name + FILENAME=$(basename "$SOURCE_FILE") + + # Find the corresponding hash from original file + ORIG_HASH=$(grep "$SOURCE_FILE" /test/complex-original-md5.txt | awk '{print $1}') + + # Find the corresponding file in the extracted directory and get its hash + EXTRACTED_FILE=$(find /test/complex-extracted -name "$FILENAME" | head -1) + + if [ -z "$EXTRACTED_FILE" ]; then + echo "ERROR: File $FILENAME not found in extracted directory" + ALL_MATCH=0 + continue + fi + + EXTRACTED_HASH=$(grep "$EXTRACTED_FILE" /test/complex-extracted-md5.txt | awk '{print $1}') + + if [ "$ORIG_HASH" != "$EXTRACTED_HASH" ]; then + echo "ERROR: Hash mismatch for $FILENAME: original=$ORIG_HASH extracted=$EXTRACTED_HASH" + ALL_MATCH=0 + fi + done + + if [ "$ALL_MATCH" -eq 1 ]; then echo 'SUCCESS: All file hashes match between original and extracted files!' else echo 'ERROR: Hash mismatch detected!' 
@@ -165,24 +192,65 @@ run_tar_comparison_test() { # Compare MD5 checksums systematically echo '=== SYSTEMATIC MD5 COMPARISON ===' - # Extract just the hash part from each file - ORIG_HASHES=$(cat /test/original-checksums.txt | awk '{print $1}' | sort) - TARB_HASHES=$(cat /test/standard-checksums.txt | awk '{print $1}' | sort) - REF_HASHES=$(cat /test/reference-checksums.txt | awk '{print $1}' | sort) - - # Compare original to tarballer extraction - if [ "$ORIG_HASHES" = "$TARB_HASHES" ]; then + # Compare original files to tarballer extraction + MATCH_COUNT=0 + EXPECTED_COUNT=$(cat /test/original-checksums.txt | wc -l) + + # For each original file, find its corresponding extracted file and compare hashes + while read -r line; do + ORIG_HASH=$(echo "$line" | awk '{print $1}') + ORIG_FILE=$(echo "$line" | awk '{print $2}') + FILENAME=$(basename "$ORIG_FILE") + + # Find the corresponding file in the extracted directory + EXTRACTED_FILE=$(find /test/standard-extracted -name "$FILENAME" | grep -v ".md5-manifest.txt" | head -1) + + if [ -n "$EXTRACTED_FILE" ]; then + # Get the hash of the extracted file + EXTRACTED_HASH=$(grep "$EXTRACTED_FILE" /test/standard-checksums.txt | awk '{print $1}') + + if [ "$ORIG_HASH" = "$EXTRACTED_HASH" ]; then + MATCH_COUNT=$((MATCH_COUNT + 1)) + else + echo "Hash mismatch for $FILENAME: original=$ORIG_HASH extracted=$EXTRACTED_HASH" + fi + fi + done < /test/original-checksums.txt + + if [ "$MATCH_COUNT" -eq "$EXPECTED_COUNT" ]; then echo 'SUCCESS: Tarballer extraction hashes match original files!' else - echo 'ERROR: Hash mismatch detected between original and tarballer extraction!' + echo "ERROR: Hash mismatch detected! Matched $MATCH_COUNT of $EXPECTED_COUNT files." 
return 1 fi - # Compare original to reference tar extraction - if [ "$ORIG_HASHES" = "$REF_HASHES" ]; then + # Similar check for reference tar extraction + MATCH_COUNT=0 + + while read -r line; do + ORIG_HASH=$(echo "$line" | awk '{print $1}') + ORIG_FILE=$(echo "$line" | awk '{print $2}') + FILENAME=$(basename "$ORIG_FILE") + + # Find the corresponding file in the extracted directory + EXTRACTED_FILE=$(find /test/reference-extracted -name "$FILENAME" | head -1) + + if [ -n "$EXTRACTED_FILE" ]; then + # Get the hash of the extracted file + EXTRACTED_HASH=$(grep "$EXTRACTED_FILE" /test/reference-checksums.txt | awk '{print $1}') + + if [ "$ORIG_HASH" = "$EXTRACTED_HASH" ]; then + MATCH_COUNT=$((MATCH_COUNT + 1)) + else + echo "Hash mismatch for $FILENAME: original=$ORIG_HASH reference=$EXTRACTED_HASH" + fi + fi + done < /test/original-checksums.txt + + if [ "$MATCH_COUNT" -eq "$EXPECTED_COUNT" ]; then echo 'SUCCESS: Reference tar extraction hashes match original files!' else - echo 'ERROR: Hash mismatch detected between original and reference tar extraction!' + echo "ERROR: Hash mismatch detected! Matched $MATCH_COUNT of $EXPECTED_COUNT files." return 1 fi