Mirror of https://codeberg.org/h3xx/simplify_static_dir (synced 2024-08-14 23:57:24 +00:00)
Re-write script

Breaking changes:

* Remove support for symlink generation

Bugs fixed:

* SHA-1 hash collisions no longer corrupt yer data

Internal changes:

* Rework into a helper package
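Editor's note on the collision fix: the commit stops trusting the digest alone. Two files are treated as duplicates only when their SHA-1 digests match *and* a byte-for-byte comparison agrees (see `_entries_contents_match` in the diff below). A minimal standalone sketch of that guard, assuming nothing beyond core modules (the sub name `same_contents` is illustrative, not from the commit):

    #!/usr/bin/perl
    # Sketch: a matching SHA-1 digest only nominates a candidate;
    # File::Compare confirms the match before any linking would happen.
    use strict;
    use warnings;
    use Digest::SHA;
    use File::Compare qw/ compare /;

    sub same_contents {
        my ($file_a, $file_b) = @_;
        my $digest_a = Digest::SHA->new->addfile($file_a)->hexdigest;
        my $digest_b = Digest::SHA->new->addfile($file_b)->hexdigest;

        # different digests: definitely different contents
        return 0 if $digest_a ne $digest_b;

        # same digest: verify byte-for-byte (compare() returns 0 on equality)
        return 0 == compare($file_a, $file_b);
    }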
parent c6beadda5f
commit 79e3eca2cb

2 changed files with 301 additions and 174 deletions

CHANGELOG.md (10)
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+## [2.0.0]
+
+### Fixed
+
+- SHA1 collisions no longer corrupt yer data
+
+### Changed
+
+- Remove support for symlink generation
+
 ## [1.2.1]
 
 - Fixed bug when processing files with \r characters in the name
simplify_static_dir

@@ -3,7 +3,7 @@
 use strict;
 use warnings;
 
-our $VERSION = '1.2.3';
+our $VERSION = '2.0.0';
 
 =pod
 
@@ -60,14 +60,6 @@ Only match file paths matching I<REGEX>.
 
 Exclude file paths matching I<REGEX>.
 
-=item B<-s>
-
-Generate symlinks only.
-
-=item B<-S>
-
-Do not generate ANY symlinks.
-
 =item B<-z>
 
 Include zero-length files in search. Normally they are ignored (you don't save
@@ -83,8 +75,7 @@ Output version information and exit.
 
 =back
 
-By default, scans the current directory. Files not able to be hard-linked are
-symlinked by default.
+By default, scans the current directory.
 
 =head1 COPYRIGHT
 
@@ -107,27 +98,22 @@ require Digest::SHA;
 use Getopt::Std qw/ getopts /;
 
 sub HELP_MESSAGE {
-#   my $fh = shift;
-#   print $fh <<EOF
-#Usage: $0 [DIRS]
-#Simplify a directory by hard-linking identical files.
-#
-#  -v           Verbose output.
-#  -f           Print a sum of the number of freed bytes.
-#  -m REGEX     Only match file paths matching REGEX.
-#  -M REGEX     Exclude file paths matching REGEX.
-#  -s           Generate symlinks only.
-#  -S           Do not generate ANY symlinks.
-#  -z           Include zero-length files in search.
-#
-#By default, scans the current directory. Files not able to be hard-linked are
-#symlinked by default.
-#EOF
-#;
-    require Pod::Text;
-    my ($fh, $pod) = (shift, Pod::Text->new);
-    $pod->parse_from_file($0, $fh);
+    my $fh = shift;
+    print $fh <<EOF
+Usage: $0 [DIRS]
+Simplify a directory by hard-linking identical files.
+
+  -v            Verbose output.
+  -f            Print a sum of the number of freed bytes.
+  -m REGEX      Only match file paths matching REGEX.
+  -M REGEX      Exclude file paths matching REGEX.
+  -z            Include zero-length files in search.
+
+By default, scans the current directory.
+
+See also `perldoc $0'
+EOF
+;
     exit 0;
 }
 
@@ -136,14 +122,16 @@ my %opts = (
     f => 0,
     m => '',
     M => '',
-    s => 0,
-    S => 0,
     z => 0,
 );
 
-&getopts('vfm:M:sSz', \%opts);
+&getopts('vfm:M:z', \%opts);
 
-my %files;
+my $filehash = new Directory::Simplify::FileHash;
+
+# include zero-length files if wanted (`-z')
+$filehash->min_linkable_size(0)
+    if $opts{z};
 
 # correct relative paths
 # OR if no directories given, search the current directory
@@ -153,6 +141,10 @@ my $freed_bytes = 0;
 
 &find(\&findexec, @ARGV);
 
+printf STDERR "freed %d bytes (%0.4G %s)\n",
+    $freed_bytes, &hr_size($freed_bytes)
+        if $opts{f} or $opts{v};
+
 sub findexec {
     # outright skip directories (don't report skip)
     return if -d $File::Find::name;
@@ -180,86 +172,43 @@ sub findexec {
         return;
     }
 
-    #my $ctx = Digest::MD5->new;
-    my $ctx = Digest::SHA->new;
-    $ctx->addfile($File::Find::name);
+    my $entry = $filehash->make_entry($File::Find::name);
 
-    # save the hex digest because reading the value from
-    # Digest::* destroys it
-    my $digest = $ctx->hexdigest;
-
-    # organize results from lstat into hash
-    my $entry = {};
-    (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
-            atime mtime ctime blksize blocks /})
-        = ($File::Find::name, lstat $File::Find::name);
-
-    # skip zero-length files if wanted (`-z')
-    return unless $opts{z} or $entry->{size};
-
-    # check to see if we've come across a file with the same checksum
-    unless (exists $files{$digest}) {
-        # the file is unique (as far as we know)
-        # create a new entry in the hash table
-        $files{$digest} = $entry;
-        return;
+    my @linkable = $filehash->find_hardlinkable($entry);
+    if (@linkable) {
+        &hardlink_entries($entry, @linkable);
     }
+    $filehash->add_entry($entry);
+}
 
-    my $curr_entry = $files{$digest};
+sub hardlink_entries {
+    my ($entry, @linkable) = @_;
 
-    # don't waste my time
-    return if $curr_entry->{name} eq $entry->{name} or
-        $curr_entry->{ino} == $entry->{ino};
+    # only one of the linkable entries should suffice
+    my $linking_with = $linkable[0];
 
-    # identical files should be the same size (I'm paranoid
-    # of checksumming procedures); if it's not, there's a
-    # problem with the checksumming procedure or this
-    # script is processing WAY too many files
-    # (addendum: perhaps I should congratulate the user on
-    # finding a collision in SHA-1)
-    if ($curr_entry->{size} != $entry->{size}) {
-        die "ERROR: checksums identical for two non-identical files!!!:\n".
-        "files:\t`$curr_entry->{name}'\n".
-        "\t`$entry->{name}'\n".
-        "SHA1: ($digest)\n".
-        '(this is probably a limit of SHA1; try processing fewer files)';
-    }
-
-    # find the oldest time stamp
-    my ($atime, $mtime) = @{(sort
-        {$a->{mtime} <=> $b->{mtime}}
-        ($entry, $curr_entry)
+    # calculate the timestamp of the resulting file
+    my ($atime, $mtime) = @{(
+        $filehash->oldest_mtime($entry, $linking_with)
     )[0]}{qw/ atime mtime /};
 
     # find the file less embedded in the file system
-    my ($less_linked, $more_linked) = sort
-        {$a->{nlink} <=> $b->{nlink}}
-        ($entry, $curr_entry);
+    my ($less_linked, $more_linked) = $filehash->less_linked($entry, $linking_with);
+
+    printf STDERR "removing file `%s'\n", $less_linked->{name}
+        if $opts{v};
 
-    # hard-linkable files must exist on the same device and
-    # must not already be hard-linked
-    if ($curr_entry->{dev} == $entry->{dev} &&
-        ! $opts{s}) {
-        # attempt to unlink the file
-        printf STDERR "removing file `%s'\n",
-            $less_linked->{name} if $opts{v};
     unless (unlink $less_linked->{name}) {
 
         # couldn't do it; try more-linked file
 
         printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n",
             $less_linked->{name},
             $!,
-                $more_linked->{name}
-            if $opts{v};
+            $more_linked->{name};
 
-            # if we can't do this, there's no point
-            # in continuing
+        # if we can't do this, there's no point in continuing
         unless (unlink $more_linked->{name}) {
             printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
                 $more_linked->{name},
-                    $!
-                    if $opts{v};
+                $!;
 
             return;
         }
@@ -267,7 +216,6 @@ sub findexec {
         # the ol' switcheroo
         ($more_linked, $less_linked) =
         ($less_linked, $more_linked);
-
     }
 
     # we unlinked it or failed out
@@ -282,63 +230,29 @@ sub findexec {
     link $more_linked->{name},
     $less_linked->{name};
 
-        # update link count in our hash to reflect the
-        # file system (referenced)
+    # preserve older time stamp
+    utime $atime, $mtime, $more_linked->{name};
+    $more_linked->{atime} = $atime;
+    $more_linked->{mtime} = $mtime;
+
+    # update link count in our hash to reflect the file system (referenced)
     ++$more_linked->{nlink};
 
-        # preserve older time stamp
-        utime $atime, $mtime, $less_linked->{name};
-    } elsif (! $opts{S}) {
-        # files are on different drives;
-        # most that can be done is to soft-link them
-
-        unless (unlink $less_linked->{name}) {
-
-            # couldn't do it; try more-linked file
-
-            printf STDERR "couldn't remove file `%s' (using `%s')\n",
-            $less_linked->{name},
-            $more_linked->{name} if $opts{v};
-
-            # if we can't do this, there's no point
-            # in continuing
-            unlink $more_linked->{name}
-                or return;
-
-            # the ol' switcheroo
-            ($more_linked, $less_linked) =
-            ($less_linked, $more_linked);
-
+    # update old entry with the info from the new one
+    foreach my $copy_attr (qw/
+        ino
+        nlink
+        mode
+        uid
+        gid
+        atime
+        mtime
+        ctime
+    /) {
+        $less_linked->{$copy_attr} = $more_linked->{$copy_attr};
     }
 
-        # we unlinked it or failed out
-        $freed_bytes += $less_linked->{size};
-
-        printf STDERR "soft-linking %s => %s\n",
-            $less_linked->{name}, $more_linked->{name}
-        if $opts{v};
-
-        # create a soft link (TODO: relative links)
-        symlink $more_linked->{name},
-        $less_linked->{name};
-
-        # preserve older time stamp
-        utime $atime, $mtime, $less_linked->{name};
-    }
-    #} elsif (-l $File::Find::name) {
-    #   # do something to simplify symlinks
-    #   printf STDERR "FIXME: simplifying symlink `%s'\n",
-    #   $File::Find::name
-    #   if $opts{v};
-
-    #   printf STDERR "symlink `%s' points to `%s'\n",
-    #   $File::Find::name, readlink $File::Find::name;
 }
 
-printf STDERR "freed %d bytes (%0.4G %s)\n",
-    $freed_bytes, &hr_size($freed_bytes)
-        if $opts{f} or $opts{v};
 
 sub hr_size {
     my $sz = shift;
     my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
@@ -357,3 +271,206 @@ sub hr_size {
 
     wantarray ? @ret : "@ret"
 }
+
+package Directory::Simplify::FileHash;
+use strict;
+use warnings;
+
+=head1 DESCRIPTION
+
+Object for abstracting management of a hashed filesystem
+
+=cut
+
+sub new {
+    my ($class, $self) = (shift, {});
+
+    $self->{_files} = {};
+
+    require Digest::SHA;
+    $self->{_ctx} = Digest::SHA->new;
+
+    # default options
+    $self->{_min_linkable_size} = 1;
+
+    bless $self, $class
+}
+
+=head2 min_linkable_size($bytes)
+
+Set or get the minimum size of files to be considered hard-linkable. Default is 1.
+
+=cut
+
+sub min_linkable_size {
+    my $self = shift;
+    my $in = shift;
+    if (defined $in) {
+        $self->{_min_linkable_size} = $in;
+    }
+    $self->{_min_linkable_size}
+}
+
+=head2 make_entry($filename)
+
+=cut
+
+sub make_entry {
+    my $self = shift;
+    my ($filename) = @_;
+    # organize results from lstat into hash
+    my $entry = {};
+    (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
+            atime mtime ctime blksize blocks /})
+        = ($filename, lstat $filename);
+
+    $entry->{hash} = $self->_hash($filename);
+
+    $entry
+}
+
+=head2 add_entry($entry)
+
+=cut
+
+sub add_entry {
+    my $self = shift;
+    my ($entry) = @_;
+
+    my $hash = $entry->{hash};
+
+    unless (defined $self->{_files}->{$hash}) {
+        $self->{_files}->{$hash} = [];
+    }
+    push @{$self->{_files}->{$hash}}, $entry;
+}
+
+sub find_hardlinkable {
+    my $self = shift;
+    my ($entry) = @_;
+
+    my $hash = $entry->{hash};
+
+    # no matching entries
+    unless (defined $self->{_files}->{$hash}) {
+        return ();
+    }
+
+    my @matches;
+    foreach my $ent (@{$self->{_files}->{$hash}}) {
+        if ($self->_entries_are_hard_linkable($entry, $ent)) {
+            push @matches, $ent;
+        }
+    }
+
+    return @matches;
+
+}
+
+=head2 less_linked($entry_a, $entry_b, ...)
+
+Find the file less embedded in the file system.
+
+=cut
+
+sub less_linked {
+    my $self = shift;
+    return sort
+        {$a->{nlink} <=> $b->{nlink}}
+        @_;
+}
+
+=head2 oldest_mtime($entry_a, $entry_b, ...)
+
+Find the entry with the oldest time stamp.
+
+=cut
+
+sub oldest_mtime {
+    my $self = shift;
+
+    return sort
+        {$a->{mtime} <=> $b->{mtime}}
+        @_;
+}
+
+sub _hash {
+    my $self = shift;
+    my ($filename) = @_;
+    $self->{_ctx}->addfile($filename);
+    return $self->{_ctx}->hexdigest;
+}
+
+sub _entries_are_hard_linkable {
+    my $self = shift;
+    my ($entry_a, $entry_b) = @_;
+
+    # obviously, if the sizes aren't the same, they're not the same file
+    unless (&_entries_sizes_match($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    # too small to be hard-linked
+    if ($entry_a->{size} < $self->min_linkable_size) {
+        return 0;
+    }
+
+    # they're the same file, don't try it
+    if (&_entries_are_same_filename($entry_a, $entry_b)) {
+        return 0;
+    }
+    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    if (&_entries_contents_match($entry_a, $entry_b)) {
+        return 1;
+    }
+    return 0;
+}
+
+sub _entries_are_same_filename {
+    my ($entry_a, $entry_b) = @_;
+
+    if ($entry_a->{name} eq $entry_b->{name}) {
+        return 1;
+    }
+    return 0;
+}
+
+sub _entries_are_already_hard_linked {
+    my ($entry_a, $entry_b) = @_;
+
+    if ($entry_a->{ino} == $entry_b->{ino} and
+        $entry_a->{dev} == $entry_b->{dev}) {
+        return 1;
+    }
+
+    return 0;
+}
+
+sub _entries_sizes_match {
+    my ($entry_a, $entry_b) = @_;
+    if ($entry_a->{size} != $entry_b->{size}) {
+        return 0;
+    }
+    return 1;
+}
+
+sub _entries_contents_match {
+    my ($entry_a, $entry_b) = @_;
+
+    # also, if the hashes aren't the same, they cannot be the same file
+    if ($entry_a->{hash} ne $entry_b->{hash}) {
+        return 0;
+    }
+
+    use File::Compare   qw/ compare /;
+    my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name}));
+
+    # warn about hash collision
+    unless ($contents_same) {
+        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n  (don't panic)\n";
+    }
+    return $contents_same;
+}
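Editor's note: a minimal driver sketch showing how the new Directory::Simplify::FileHash helper is meant to be used, mirroring the script's own main loop. This assumes the package defined above is in scope; the starting directory is a hypothetical example.

    #!/usr/bin/perl
    use strict;
    use warnings;
    use File::Find qw/ find /;

    my $filehash = Directory::Simplify::FileHash->new;

    find(sub {
        return if -d $File::Find::name;

        # lstat + hash the file, then look for content-identical,
        # not-yet-linked entries recorded earlier in the walk
        my $entry = $filehash->make_entry($File::Find::name);

        if (my @linkable = $filehash->find_hardlinkable($entry)) {
            printf "%s could be hard-linked with %s\n",
                $entry->{name}, $linkable[0]->{name};
        }

        $filehash->add_entry($entry);
    }, '.');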