Add exclude patterns functionality with tests and documentation

2025-03-21 14:56:47 -04:00 · 2025-03-21 14:56:47 -04:00 · fdc692c90c
parent b546e84afd
commit fdc692c90c
7 changed files with 319 additions and 68 deletions
--- a/README.md
+++ b/README.md
@ -11,6 +11,7 @@ A utility to create tarballs with a specific directory structure and built-in MD
 - Compatible with standard tar tools
 - Built-in MD5 hash verification
 - Automatic file integrity checks during extraction
+- Pattern-based file exclusion for creating targeted archives

 ## Building

@ -56,6 +57,8 @@ Options:
 - `-source`: Source directory to compress (required)
 - `-output`: Output tarball filename (default: "output.tar.gz")
 - `-prefix`: Directory name that will contain all files in the tarball (default: "myapp")
+- `-exclude`: Comma-separated list of patterns to exclude (e.g. "*.log,*.tmp,temp/")
+- `-verbose`: Enable detailed output during operation

 ### Extract Mode
 ```bash
@ -67,6 +70,7 @@ Options:
 - `-output`: Tarball to extract (required)
 - `-extractdir`: Extraction directory (default: current directory)
 - `-verify`: Only verify hash integrity without extraction
+- `-verbose`: Enable detailed output during operation

 ### Examples

@ -74,9 +78,15 @@ Options:
 # Create a tarball (macOS)
 ./bin/tarballer-darwin -source ./myproject -output release.tar.gz -prefix app

+# Create a tarball excluding specific files
+./bin/tarballer-darwin -source ./myproject -output release.tar.gz -prefix app -exclude "*.log,bin/,temp/"
+
 # Extract and verify (Linux)
 ./bin/tarballer-linux -extract -output release.tar.gz -extractdir /path/to/extract

+# Extract with verbose output
+./bin/tarballer-linux -extract -output release.tar.gz -extractdir /path/to/extract -verbose
+
 # Only verify integrity
 ./bin/tarballer-linux -extract -verify -output release.tar.gz
 ```
@ -87,3 +97,17 @@ Options:
 2. During extraction, file hashes are verified against the manifest
 3. The manifest file is removed after successful extraction
 4. Extraction aborts with an error if verification fails 
+
+## Exclude Patterns
+
+The `-exclude` flag accepts a comma-separated list of patterns to exclude from the tarball:
+
+- Simple wildcards using `*` (matches any sequence of characters, including `/`) and `?` (matches any single character)
+- Directory patterns (ending with `/`) exclude entire directory trees
+- File patterns can match by extension (e.g., `*.log`) or name
+
+Examples:
+- `*.log` - Excludes all files with the .log extension
+- `bin/` - Excludes the bin directory and all its contents
+- `temp/,*.tmp` - Excludes the temp directory and all .tmp files
+- `cache/*,*.bak` - Excludes all contents of the cache directory and all .bak files 
--- a/bin/tarballer-darwin
+++ b/bin/tarballer-darwin
--- a/bin/tarballer-freebsd
+++ b/bin/tarballer-freebsd
--- a/bin/tarballer-linux
+++ b/bin/tarballer-linux
--- a/main.go
+++ b/main.go
@ -11,6 +11,7 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"time"
 )
@ -26,6 +27,7 @@ func main() {
 	extractDir := flag.String("extractdir", "", "Directory to extract to (default: current directory)")
 	verifyOnly := flag.Bool("verify", false, "Only verify hash integrity without extraction")
 	verboseMode := flag.Bool("verbose", false, "Enable verbose output")
+	excludePatterns := flag.String("exclude", "", "Comma-separated list of patterns to exclude (e.g. \"*.log,*.tmp,temp/\")")
 	flag.Parse()

 	if *extractMode {
@ -58,7 +60,29 @@ func main() {

 		}

-		err := createTarball(*sourceDir, *outputFile, *prefixDir, *verboseMode)
+		// Process exclude patterns
+		var excludeRegexps []*regexp.Regexp
+		if *excludePatterns != "" {
+			patterns := strings.Split(*excludePatterns, ",")
+			for _, pattern := range patterns {
+				// Trim spaces
+				pattern = strings.TrimSpace(pattern)
+				if pattern == "" {
+					continue
+				}
+
+				// Convert glob pattern to regexp
+				regexPattern := globToRegexp(pattern)
+				re, err := regexp.Compile(regexPattern)
+				if err != nil {
+					fmt.Printf("Invalid exclude pattern %q: %v\n", pattern, err)
+					os.Exit(1)
+				}
+				excludeRegexps = append(excludeRegexps, re)
+			}
+		}
+
+		err := createTarball(*sourceDir, *outputFile, *prefixDir, excludeRegexps, *verboseMode)
 		if err != nil {
 			fmt.Printf("Error creating tarball: %v\n", err)
 			os.Exit(1)
@ -68,6 +92,29 @@ func main() {
 	}
 }

+// globToRegexp converts a glob-style exclude pattern into the source text of a
+// regular expression. The result is meant to be matched against slash-separated
+// paths relative to the archive root.
+//
+// Translation rules:
+//   - "*" becomes ".*" and "?" becomes "."; both may match "/".
+//   - Every other character is matched literally.
+//   - The expression is anchored to the start of a path segment, so "bin/"
+//     does not match "sbin/tool" and "main.go" does not match "domain.go".
+//   - A pattern ending in "/" names a directory: it matches the directory
+//     entry itself ("bin") as well as everything beneath it ("bin/tool").
+//   - Any other pattern must match through to the end of the path.
+//
+// For example "*.log" yields `(?:^|/).*\.log$` and "temp/" yields
+// `(?:^|/)temp(?:/.*)?$`.
+func globToRegexp(pattern string) string {
+	// Escape everything first so regexp metacharacters in file names stay
+	// literal, then re-enable the two glob wildcards.
+	expr := regexp.QuoteMeta(pattern)
+	expr = strings.ReplaceAll(expr, "\\*", ".*")
+	expr = strings.ReplaceAll(expr, "\\?", ".")
+
+	// Matching only at the start of the path or right after a "/" keeps a
+	// pattern from firing in the middle of an unrelated name.
+	const segmentStart = "(?:^|/)"
+
+	if strings.HasSuffix(expr, "/") {
+		// Directory pattern: the optional "/..." tail lets the bare directory
+		// entry match too, so the walker can skip the whole subtree at once.
+		return segmentStart + strings.TrimSuffix(expr, "/") + "(?:/.*)?$"
+	}
+
+	// File pattern: must run to the end of the path.
+	return segmentStart + expr + "$"
+}
+
+
 // calcFileMD5 calculates the MD5 hash of a file
 func calcFileMD5(filePath string) (string, error) {
 	file, err := os.Open(filePath)
@ -84,7 +131,7 @@ func calcFileMD5(filePath string) (string, error) {
 	return hex.EncodeToString(hash.Sum(nil)), nil
 }

-func createTarball(sourceDir, outputFile, prefix string, verboseMode bool) error {
+func createTarball(sourceDir, outputFile, prefix string, excludePatterns []*regexp.Regexp, verboseMode bool) error {
 	// Resolve absolute path of source directory
 	absSourceDir, err := filepath.Abs(sourceDir)
 	if err != nil {
@ -97,6 +144,12 @@ func createTarball(sourceDir, outputFile, prefix string, verboseMode bool) error
 		if prefix != "" {
 			fmt.Printf("Using prefix: %s\n", prefix)
 		}
+		if len(excludePatterns) > 0 {
+			fmt.Println("Using exclude patterns:")
+			for i, pattern := range excludePatterns {
+				fmt.Printf("  %d: %s\n", i+1, pattern)
+			}
+		}
 	}

 	// Create the output file
@ -120,6 +173,7 @@ func createTarball(sourceDir, outputFile, prefix string, verboseMode bool) error
 	// Create a map to store file hashes
 	hashes := make(map[string]string)
 	fileCount := 0
+	skippedCount := 0

 	// Walk through the source directory
 	err = filepath.Walk(absSourceDir, func(path string, info os.FileInfo, err error) error {
@ -143,6 +197,23 @@ func createTarball(sourceDir, outputFile, prefix string, verboseMode bool) error
 			return nil
 		}

+		// Check if file matches any exclude patterns
+		if len(excludePatterns) > 0 {
+			relPathForward := filepath.ToSlash(relPath)
+			for _, pattern := range excludePatterns {
+				if pattern.MatchString(relPathForward) {
+					if verboseMode {
+						fmt.Printf("Excluding: %s (matched pattern)\n", relPathForward)
+					}
+					skippedCount++
+					if info.IsDir() {
+						return filepath.SkipDir
+					}
+					return nil
+				}
+			}
+		}
+
 		// Add prefix if specified
 		if prefix != "" {
 			relPath = filepath.Join(prefix, relPath)
@ -233,7 +304,10 @@ func createTarball(sourceDir, outputFile, prefix string, verboseMode bool) error
 	}

 	if verboseMode {
-		fmt.Printf("Added %d files to tarball\n", fileCount)
+		fmt.Printf("Added %d files to tarball\n", fileCount-skippedCount)
+		if skippedCount > 0 {
+			fmt.Printf("Excluded %d files/directories\n", skippedCount)
+		}
 		fmt.Println("Creating MD5 manifest...")
 	}

--- a/test/README.md
+++ b/test/README.md
@ -57,10 +57,47 @@ If you need to manually test the tarballer utility, you can:
   # Create a tarball
   ./bin/tarballer-darwin -source /path/to/source -output output.tar.gz -prefix myapp
   
+   # Test exclude patterns
+   ./bin/tarballer-darwin -source /path/to/source -output output.tar.gz -prefix myapp -exclude "*.log,temp/"
+   
+   # Test with verbose output
+   ./bin/tarballer-darwin -source /path/to/source -output output.tar.gz -prefix myapp -verbose
+   
   # Extract and verify a tarball
   ./bin/tarballer-darwin -extract -output output.tar.gz -extractdir /path/to/extract
   ```

+## Testing Exclude Patterns
+
+To test the exclude patterns feature:
+
+1. Create a directory with various file types:
+   ```bash
+   mkdir -p test-dir/logs test-dir/bin test-dir/src
+   touch test-dir/file1.txt test-dir/file2.txt
+   touch test-dir/logs/app.log test-dir/logs/error.log
+   touch test-dir/bin/executable
+   touch test-dir/src/main.go test-dir/src/util.go
+   ```
+
+2. Test with various exclude patterns:
+   ```bash
+   # Exclude all .log files
+   ./bin/tarballer-darwin -source test-dir -output test1.tar.gz -prefix test -exclude "*.log" -verbose
+
+   # Exclude entire directories
+   ./bin/tarballer-darwin -source test-dir -output test2.tar.gz -prefix test -exclude "logs/,bin/" -verbose
+
+   # Exclude multiple patterns
+   ./bin/tarballer-darwin -source test-dir -output test3.tar.gz -prefix test -exclude "*.log,*.go,bin/" -verbose
+   ```
+
+3. Extract and verify that exclusions worked:
+   ```bash
+   ./bin/tarballer-darwin -extract -output test1.tar.gz -extractdir test1-extracted -verbose
+   find test1-extracted -type f | grep '\.log$'  # Should return nothing
+   ```
+
 ## Modifying Tests

 When modifying tests, keep in mind that the test script uses the container's `/tmp` directory for all temporary files. This keeps the test process self-contained within the container. 
--- a/test/test.sh
+++ b/test/test.sh
@ -27,6 +27,8 @@ cleanup_files() {
    rm -rf /tmp/standard-test /tmp/standard-extracted /tmp/reference-extracted
    rm -f /tmp/original-checksums.txt /tmp/standard-checksums.txt /tmp/reference-checksums.txt
    rm -f /workdir/complex.tar.gz /workdir/standard.tar.gz /workdir/reference.tar.gz
+    rm -rf /tmp/exclude-test /tmp/exclude-extracted
+    rm -f /workdir/exclude.tar.gz
    echo "Temporary files cleaned up"
  else
    echo "Keeping temporary files for inspection"
@ -160,6 +162,113 @@ run_basic_test() {
  return 0
 }

+# run_exclude_patterns_test exercises the tarballer's -exclude flag.
+#
+# It builds a fixture tree under /tmp/exclude-test, creates three archives in
+# /workdir (exclude1/2/3.tar.gz) with different exclude patterns, extracts each
+# with the system tar into /tmp/exclude-extracted/testN, and checks that none of
+# the excluded files or directories are present.
+#
+# Returns 0 when every scenario passes and 1 on the first failure, including a
+# failure to create or extract an archive (otherwise an empty extraction
+# directory would make the "nothing found" checks pass by accident).
+run_exclude_patterns_test() {
+  echo "=== RUNNING EXCLUDE PATTERNS TEST ==="
+
+  # Clean up anything left over from a previous run, including all three
+  # archives this test writes
+  rm -rf /tmp/exclude-test /tmp/exclude-extracted
+  rm -f /workdir/exclude1.tar.gz /workdir/exclude2.tar.gz /workdir/exclude3.tar.gz
+
+  # Create test directory structure
+  mkdir -p /tmp/exclude-test/logs /tmp/exclude-test/temp
+  mkdir -p /tmp/exclude-test/src/lib /tmp/exclude-test/bin
+  mkdir -p /tmp/exclude-test/data
+
+  # Create various file types
+  echo "Main text file" > /tmp/exclude-test/main.txt
+  echo "Config file" > /tmp/exclude-test/config.ini
+
+  # Log files (to be excluded with pattern *.log)
+  echo "Log file 1" > /tmp/exclude-test/logs/app.log
+  echo "Log file 2" > /tmp/exclude-test/logs/error.log
+
+  # Temporary files (to be excluded with pattern temp/)
+  echo "Temp file 1" > /tmp/exclude-test/temp/cache.tmp
+  echo "Temp file 2" > /tmp/exclude-test/temp/session.tmp
+
+  # Source files (some to be excluded with pattern *.go)
+  echo "Source file Go" > /tmp/exclude-test/src/main.go
+  echo "Source file C" > /tmp/exclude-test/src/helper.c
+  echo "Source file Go lib" > /tmp/exclude-test/src/lib/utils.go
+  echo "Source file C lib" > /tmp/exclude-test/src/lib/core.c
+
+  # Binary files (to be excluded with pattern bin/)
+  generate_random_file "/tmp/exclude-test/bin/app" 1024
+  generate_random_file "/tmp/exclude-test/bin/tool" 512
+
+  # Data files (not to be excluded)
+  generate_random_file "/tmp/exclude-test/data/data1.bin" 256
+  generate_random_file "/tmp/exclude-test/data/data2.bin" 128
+
+  # List original structure
+  echo "=== ORIGINAL STRUCTURE ==="
+  find /tmp/exclude-test -type f | sort
+  echo "Total files: $(find /tmp/exclude-test -type f | wc -l)"
+
+  # Test excluding *.log files
+  echo "=== TEST 1: EXCLUDING *.log FILES ==="
+  if ! /bin/tarballer -source /tmp/exclude-test -output /workdir/exclude1.tar.gz -prefix test -exclude "*.log" -verbose; then
+    echo "ERROR: tarballer failed to create exclude1.tar.gz"
+    return 1
+  fi
+
+  mkdir -p /tmp/exclude-extracted/test1
+  if ! tar -xzf /workdir/exclude1.tar.gz -C /tmp/exclude-extracted/test1; then
+    echo "ERROR: Could not extract exclude1.tar.gz"
+    return 1
+  fi
+
+  echo "=== EXTRACTED STRUCTURE (WITHOUT LOGS) ==="
+  find /tmp/exclude-extracted/test1 -type f | sort
+  echo "Total files: $(find /tmp/exclude-extracted/test1 -type f | wc -l)"
+
+  # Check that no .log files exist in the extracted archive
+  LOG_FILES=$(find /tmp/exclude-extracted/test1 -name "*.log" | wc -l)
+  if [ "$LOG_FILES" -eq 0 ]; then
+    echo "SUCCESS: No .log files found in the extracted archive"
+  else
+    echo "ERROR: Found .log files in the extracted archive"
+    return 1
+  fi
+
+  # Test excluding directories
+  echo "=== TEST 2: EXCLUDING DIRECTORIES (temp/ and bin/) ==="
+  if ! /bin/tarballer -source /tmp/exclude-test -output /workdir/exclude2.tar.gz -prefix test -exclude "temp/,bin/" -verbose; then
+    echo "ERROR: tarballer failed to create exclude2.tar.gz"
+    return 1
+  fi
+
+  mkdir -p /tmp/exclude-extracted/test2
+  if ! tar -xzf /workdir/exclude2.tar.gz -C /tmp/exclude-extracted/test2; then
+    echo "ERROR: Could not extract exclude2.tar.gz"
+    return 1
+  fi
+
+  echo "=== EXTRACTED STRUCTURE (WITHOUT temp/ AND bin/) ==="
+  find /tmp/exclude-extracted/test2 -type f | sort
+  echo "Total files: $(find /tmp/exclude-extracted/test2 -type f | wc -l)"
+
+  # Check that the excluded directories don't exist in the extracted archive
+  EXCLUDED_DIRS=$(find /tmp/exclude-extracted/test2 -path "*/temp" -o -path "*/bin" | wc -l)
+  if [ "$EXCLUDED_DIRS" -eq 0 ]; then
+    echo "SUCCESS: No temp/ or bin/ directories found in the extracted archive"
+  else
+    echo "ERROR: Found excluded directories in the extracted archive"
+    return 1
+  fi
+
+  # Test excluding multiple patterns
+  echo "=== TEST 3: EXCLUDING MULTIPLE PATTERNS (*.log, *.go, bin/) ==="
+  if ! /bin/tarballer -source /tmp/exclude-test -output /workdir/exclude3.tar.gz -prefix test -exclude "*.log,*.go,bin/" -verbose; then
+    echo "ERROR: tarballer failed to create exclude3.tar.gz"
+    return 1
+  fi
+
+  mkdir -p /tmp/exclude-extracted/test3
+  if ! tar -xzf /workdir/exclude3.tar.gz -C /tmp/exclude-extracted/test3; then
+    echo "ERROR: Could not extract exclude3.tar.gz"
+    return 1
+  fi
+
+  echo "=== EXTRACTED STRUCTURE (WITH MULTIPLE EXCLUSIONS) ==="
+  find /tmp/exclude-extracted/test3 -type f | sort
+  echo "Total files: $(find /tmp/exclude-extracted/test3 -type f | wc -l)"
+
+  # Check all exclusions
+  EXCLUDED_FILES=$(find /tmp/exclude-extracted/test3 -name "*.log" -o -name "*.go" -o -path "*/bin/*" | wc -l)
+  if [ "$EXCLUDED_FILES" -eq 0 ]; then
+    echo "SUCCESS: All excluded patterns are working correctly"
+  else
+    echo "ERROR: Found files that should have been excluded"
+    return 1
+  fi
+
+  echo "Exclude patterns test completed successfully!"
+  return 0
+}
+
+
 run_tar_comparison_test() {
  echo "=== RUNNING TAR COMPARISON TEST ==="
  
@ -248,121 +357,128 @@ EOF
  # For each original file, find its corresponding extracted file and compare hashes
  while read -r line; do
    ORIG_HASH=$(echo "$line" | awk '{print $1}')
-    ORIG_FILE=$(echo "$line" | awk '{print $2}')
+    ORIG_FILE=$(echo "$line" | awk '{$1=""; print $0}' | sed 's/^ //')
    FILENAME=$(basename "$ORIG_FILE")
    
-    # Find the corresponding file in the extracted directory
-    EXTRACTED_FILE=$(find /tmp/standard-extracted -name "$FILENAME" | grep -v ".md5-manifest.txt" | head -1)
-    
+    # Look for the same file in the tarballer output
+    EXTRACTED_FILE=$(find /tmp/standard-extracted -name "$FILENAME" | head -1)
    if [ -n "$EXTRACTED_FILE" ]; then
-      # Get the hash of the extracted file
      EXTRACTED_HASH=$(grep "$EXTRACTED_FILE" /tmp/standard-checksums.txt | awk '{print $1}')
-      
      if [ "$ORIG_HASH" = "$EXTRACTED_HASH" ]; then
        MATCH_COUNT=$((MATCH_COUNT + 1))
      else
-        echo "Hash mismatch for $FILENAME: original=$ORIG_HASH extracted=$EXTRACTED_HASH"
+        echo "HASH MISMATCH: $FILENAME"
+        echo "Original: $ORIG_HASH"
+        echo "Extracted: $EXTRACTED_HASH"
      fi
+    else
+      echo "File not found in extraction: $FILENAME"
    fi
  done < /tmp/original-checksums.txt
  
  if [ "$MATCH_COUNT" -eq "$EXPECTED_COUNT" ]; then
-    echo 'SUCCESS: Tarballer extraction hashes match original files!'
+    echo "SUCCESS: Tarballer extraction hashes match original files!"
  else
-    echo "ERROR: Hash mismatch detected! Matched $MATCH_COUNT of $EXPECTED_COUNT files."
+    echo "ERROR: Only $MATCH_COUNT of $EXPECTED_COUNT hashes match"
    return 1
  fi
  
-  # Similar check for reference tar extraction
+  # Compare original files to standard tar extraction
  MATCH_COUNT=0
  
+  # For each original file, find its corresponding extracted file and compare hashes
  while read -r line; do
    ORIG_HASH=$(echo "$line" | awk '{print $1}')
-    ORIG_FILE=$(echo "$line" | awk '{print $2}')
+    ORIG_FILE=$(echo "$line" | awk '{$1=""; print $0}' | sed 's/^ //')
    FILENAME=$(basename "$ORIG_FILE")
    
-    # Find the corresponding file in the extracted directory
+    # Look for the same file in the reference output
    EXTRACTED_FILE=$(find /tmp/reference-extracted -name "$FILENAME" | head -1)
-    
    if [ -n "$EXTRACTED_FILE" ]; then
-      # Get the hash of the extracted file
      EXTRACTED_HASH=$(grep "$EXTRACTED_FILE" /tmp/reference-checksums.txt | awk '{print $1}')
-      
      if [ "$ORIG_HASH" = "$EXTRACTED_HASH" ]; then
        MATCH_COUNT=$((MATCH_COUNT + 1))
      else
-        echo "Hash mismatch for $FILENAME: original=$ORIG_HASH reference=$EXTRACTED_HASH"
+        echo "HASH MISMATCH (ref): $FILENAME"
+        echo "Original: $ORIG_HASH"
+        echo "Extracted: $EXTRACTED_HASH"
      fi
+    else
+      echo "File not found in reference extraction: $FILENAME"
    fi
  done < /tmp/original-checksums.txt
  
  if [ "$MATCH_COUNT" -eq "$EXPECTED_COUNT" ]; then
-    echo 'SUCCESS: Reference tar extraction hashes match original files!'
+    echo "SUCCESS: Reference tar extraction hashes match original files!"
  else
-    echo "ERROR: Hash mismatch detected! Matched $MATCH_COUNT of $EXPECTED_COUNT files."
+    echo "ERROR: Only $MATCH_COUNT of $EXPECTED_COUNT hashes match"
    return 1
  fi
  
-  echo '=== VERIFYING SYMLINKS ==='
-  echo 'ORIGINAL SYMLINKS:'
+  # Test symlinks
+  echo "=== VERIFYING SYMLINKS ==="
+  echo "ORIGINAL SYMLINKS:"
  find /tmp/standard-test -type l -exec ls -la {} \;
-  echo 'EXTRACTED SYMLINKS:'
+  
+  echo "EXTRACTED SYMLINKS:"
  find /tmp/standard-extracted -type l -exec ls -la {} \;
  
-  # Compare file counts to ensure all files were extracted
-  echo '=== FILE COUNT COMPARISON ==='
-  echo -n 'Original files: ' && find /tmp/standard-test -type f | wc -l
-  echo -n 'Extracted files: ' && find /tmp/standard-extracted -type f | wc -l
+  # Verify file counts
+  echo "=== FILE COUNT COMPARISON ==="
+  echo "Original files: $(find /tmp/standard-test -type f | wc -l)"
+  echo "Extracted files: $(find /tmp/standard-extracted -type f | wc -l)"
  
-  # Test symlink functionality
-  echo '=== TESTING SYMLINK CONTENT ==='
-  echo 'Original linked content:'
+  # Test symlink content
+  echo "=== TESTING SYMLINK CONTENT ==="
+  echo "Original linked content:"
  cat /tmp/standard-test/data/config-link.json
-  echo 'Extracted linked content:'
+  
+  echo "Extracted linked content:"
  cat /tmp/standard-extracted/app/data/config-link.json
  
-  echo 'Tar comparison test completed successfully!'
+  echo "Tar comparison test completed successfully!"
  return 0
 }

-# Main script execution
+# Main script logic
+echo "=== RUNNING ALL TESTS ==="
+
 case "$TEST_TYPE" in
  "basic") 
    run_basic_test
    RESULT=$?
-    [ "$RESULT" -eq 0 ] && cleanup_files
-    exit $RESULT
    ;;
  "tar") 
    run_tar_comparison_test
    RESULT=$?
-    [ "$RESULT" -eq 0 ] && cleanup_files
-    exit $RESULT
    ;;
-  "all")
-    echo "=== RUNNING ALL TESTS ==="
-    run_basic_test
-    BASIC_RESULT=$?
-    run_tar_comparison_test
-    TAR_RESULT=$?
-    
-    if [ $BASIC_RESULT -eq 0 ] && [ $TAR_RESULT -eq 0 ]; then
-      echo "✅ ALL TESTS PASSED SUCCESSFULLY!"
-      cleanup_files
-      exit 0
-    else
-      echo "❌ SOME TESTS FAILED!"
-      exit 1
-    fi
+  "exclude")
+    run_exclude_patterns_test
+    RESULT=$?
    ;;
  "clean")
    cleanup_files
-    exit 0
+    RESULT=0
    ;;
-  *)
-    echo "Unknown test type: $TEST_TYPE"
-    echo "Usage: $0 [basic|tar|all|clean] [keep_temp_files]"
-    echo "  keep_temp_files: 0 (clean up, default) or 1 (keep temp files)"
-    exit 1
+  "all"|*)
+    run_basic_test
+    RESULT1=$?
+    run_tar_comparison_test
+    RESULT2=$?
+    run_exclude_patterns_test
+    RESULT3=$?
+    RESULT=$((RESULT1 + RESULT2 + RESULT3))
    ;;
 esac
+
+if [ "$TEST_TYPE" != "clean" ]; then
+  cleanup_files
+fi
+
+if [ "$RESULT" -eq 0 ]; then
+  echo "✅ ALL TESTS PASSED SUCCESSFULLY!"
+  exit 0
+else
+  echo "❌ TESTS FAILED WITH ERRORS!"
+  exit 1
+fi