mirror of
				https://codeberg.org/h3xx/simplify_static_dir
				synced 2024-08-14 23:57:24 +00:00 
			
		
		
		
	Update script
Follows a more logical process of deciding what to do, then doing it.
This commit is contained in:
		
							parent
							
								
									7d19ccd876
								
							
						
					
					
						commit
						c77fc7a205
					
				
					 2 changed files with 304 additions and 253 deletions
				
			
		|  | @ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. | ||||||
| 
 | 
 | ||||||
| ## [Unreleased] | ## [Unreleased] | ||||||
| 
 | 
 | ||||||
|  | ## [3.0.0] | ||||||
|  | 
 | ||||||
|  | ### Changed | ||||||
|  | 
 | ||||||
|  | - Re-order operations to make the linking happen at the very end | ||||||
|  | 
 | ||||||
| ## [2.0.0] | ## [2.0.0] | ||||||
| 
 | 
 | ||||||
| ### Fixed | ### Fixed | ||||||
|  |  | ||||||
|  | @ -1,9 +1,12 @@ | ||||||
| #!/usr/bin/perl | #!/usr/bin/perl | ||||||
| # vi: et sts=4 sw=4 ts=4 | # vi: et sts=4 sw=4 ts=4 | ||||||
|  | 
 | ||||||
|  | package main; | ||||||
| use strict; | use strict; | ||||||
| use warnings; | use warnings; | ||||||
|  | require Cwd; | ||||||
| 
 | 
 | ||||||
| our $VERSION = '2.0.0'; | our $VERSION = '3.0.0'; | ||||||
| 
 | 
 | ||||||
| =pod | =pod | ||||||
| 
 | 
 | ||||||
|  | @ -117,142 +120,86 @@ EOF | ||||||
|     exit 0; |     exit 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| my %opts = ( | MAIN: { | ||||||
|     v => 0, |     my %opts = ( | ||||||
|     f => 0, |         v => 0, | ||||||
|     m => '', |         f => 0, | ||||||
|     M => '', |         m => '', | ||||||
|     z => 0, |         M => '', | ||||||
| ); |         z => 0, | ||||||
|  |     ); | ||||||
| 
 | 
 | ||||||
| &getopts('vfm:M:z', \%opts); |     &getopts('vfm:M:z', \%opts); | ||||||
| 
 | 
 | ||||||
| my $filehash = new Directory::Simplify::FileHash; |     # correct relative paths | ||||||
|  |     # OR if no directories given, search the current directory | ||||||
|  |     my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD})); | ||||||
| 
 | 
 | ||||||
| # include zero-length files if wanted (`-z') |     my @files; | ||||||
| $filehash->min_linkable_size(0) |     print STDERR 'Finding files...' | ||||||
|     if $opts{z}; |  | ||||||
| 
 |  | ||||||
| # correct relative paths |  | ||||||
| # OR if no directories given, search the current directory |  | ||||||
| push @ARGV, $ENV{PWD} unless map { s#^([^/])#$ENV{PWD}/$1# } @ARGV; |  | ||||||
| 
 |  | ||||||
| my $freed_bytes = 0; |  | ||||||
| 
 |  | ||||||
| &find(\&findexec, @ARGV); |  | ||||||
| 
 |  | ||||||
| printf STDERR "freed %d bytes (%0.4G %s)\n", |  | ||||||
|     $freed_bytes, &hr_size($freed_bytes) |  | ||||||
|         if $opts{f} or $opts{v}; |  | ||||||
| 
 |  | ||||||
| sub findexec { |  | ||||||
|     # outright skip directories (don't report skip) |  | ||||||
|     return if -d $File::Find::name; |  | ||||||
| 
 |  | ||||||
|     # limit to or exclude file patterns specified by `-m' or `-M', |  | ||||||
|     # respectively |  | ||||||
| 
 |  | ||||||
|     # truth table |  | ||||||
|     # -m matches    | -M is used & matches  | !return? |  | ||||||
|     # 0     | 0         | 0 |  | ||||||
|     # 0     | 1         | 0 |  | ||||||
|     # 1     | 0         | 1 |  | ||||||
|     # 1     | 1         | 0 |  | ||||||
|     # note: m// will match everything |  | ||||||
|     unless ($File::Find::name =~ m/$opts{m}/ and |  | ||||||
|         !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) { |  | ||||||
| 
 |  | ||||||
|         print STDERR "Skipping path `$File::Find::name'\n" |  | ||||||
|             if $opts{v}; |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     # skip non-existent files and links |  | ||||||
|     unless (-f $File::Find::name && ! -l $File::Find::name) { |  | ||||||
|         return; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     my $entry = $filehash->make_entry($File::Find::name); |  | ||||||
| 
 |  | ||||||
|     my @linkable = $filehash->find_hardlinkable($entry); |  | ||||||
|     if (@linkable) { |  | ||||||
|         &hardlink_entries($entry, @linkable); |  | ||||||
|     } |  | ||||||
|     $filehash->add_entry($entry); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| sub hardlink_entries { |  | ||||||
|     my ($entry, @linkable) = @_; |  | ||||||
| 
 |  | ||||||
|     # only one of the linkable entries should suffice |  | ||||||
|     my $linking_with = $linkable[0]; |  | ||||||
| 
 |  | ||||||
|     # calculate the timestamp of the resulting file |  | ||||||
|     my ($atime, $mtime) = @{( |  | ||||||
|         $filehash->oldest_mtime($entry, $linking_with) |  | ||||||
|     )[0]}{qw/ atime mtime /}; |  | ||||||
| 
 |  | ||||||
|     # find the file less embedded in the file system |  | ||||||
|     my ($less_linked, $more_linked) = $filehash->less_linked($entry, $linking_with); |  | ||||||
| 
 |  | ||||||
|     printf STDERR "removing file `%s'\n", $less_linked->{name} |  | ||||||
|         if $opts{v}; |         if $opts{v}; | ||||||
| 
 | 
 | ||||||
|     unless (unlink $less_linked->{name}) { |     &find(sub { | ||||||
|         printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n", |         # outright skip directories (don't report skip) | ||||||
|             $less_linked->{name}, |         return if -d $File::Find::name; | ||||||
|             $!, |  | ||||||
|             $more_linked->{name}; |  | ||||||
| 
 | 
 | ||||||
|         # if we can't do this, there's no point in continuing |         # limit to or exclude file patterns specified by `-m' or `-M', | ||||||
|         unless (unlink $more_linked->{name}) { |         # respectively | ||||||
|             printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n", |  | ||||||
|                 $more_linked->{name}, |  | ||||||
|                 $!; |  | ||||||
| 
 | 
 | ||||||
|  |         # truth table | ||||||
|  |         # -m matches    | -M is used & matches  | !return? | ||||||
|  |         # 0     | 0         | 0 | ||||||
|  |         # 0     | 1         | 0 | ||||||
|  |         # 1     | 0         | 1 | ||||||
|  |         # 1     | 1         | 0 | ||||||
|  |         # note: m// will match everything | ||||||
|  |         unless ($File::Find::name =~ m/$opts{m}/ and | ||||||
|  |             !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) { | ||||||
|  | 
 | ||||||
|  |             print STDERR "Skipping path `$File::Find::name'\n" | ||||||
|  |                 if $opts{v}; | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|         # the ol' switcheroo |         # skip non-existent files and links | ||||||
|         ($more_linked, $less_linked) = |         unless (-f $File::Find::name && ! -l $File::Find::name) { | ||||||
|         ($less_linked, $more_linked); |             return; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         push @files, $File::Find::name; | ||||||
|  |     }, @dirs_to_process); | ||||||
|  | 
 | ||||||
|  |     printf STDERR "%s files found.\nGenerating hashes...", scalar @files | ||||||
|  |         if $opts{v}; | ||||||
|  | 
 | ||||||
|  |     my $filehash = Directory::Simplify::FileHash->new; | ||||||
|  |     $filehash->add(@files); | ||||||
|  |     print STDERR "done.\n" | ||||||
|  |         if $opts{v}; | ||||||
|  | 
 | ||||||
|  |     my $generator = Directory::Simplify::Instruction::Generator->new( | ||||||
|  |         filehash => $filehash, | ||||||
|  |         min_size => ($opts{z} ? 0 : 1), | ||||||
|  |     ); | ||||||
|  | 
 | ||||||
|  |     my $freed_bytes = 0; | ||||||
|  | 
 | ||||||
|  |     foreach my $inst ($generator->instructions) { | ||||||
|  |         print STDERR $inst, "\n" if $opts{v}; | ||||||
|  |         $inst->run; | ||||||
|  |         $freed_bytes += $inst->bytes_freed; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     # we unlinked it or failed out |     printf STDERR "freed %d bytes (%0.4G %s)\n", | ||||||
|     $freed_bytes += $less_linked->{size} |         $freed_bytes, | ||||||
|         unless $less_linked->{nlink} > 1; |         Directory::Simplify::Utils::hr_size($freed_bytes) | ||||||
| 
 |             if $opts{f} or $opts{v}; | ||||||
|     printf STDERR "hard linking `%s' => `%s'\n", |  | ||||||
|         $less_linked->{name}, $more_linked->{name} |  | ||||||
|     if $opts{v}; |  | ||||||
| 
 |  | ||||||
|     # hard link the files |  | ||||||
|     link $more_linked->{name}, |  | ||||||
|     $less_linked->{name}; |  | ||||||
| 
 |  | ||||||
|     # preserve older time stamp |  | ||||||
|     utime $atime, $mtime, $more_linked->{name}; |  | ||||||
|     $more_linked->{atime} = $atime; |  | ||||||
|     $more_linked->{mtime} = $mtime; |  | ||||||
| 
 |  | ||||||
|     # update link count in our hash to reflect the file system (referenced) |  | ||||||
|     ++$more_linked->{nlink}; |  | ||||||
| 
 |  | ||||||
|     # update old entry with the info from the new one |  | ||||||
|     foreach my $copy_attr (qw/ |  | ||||||
|         ino |  | ||||||
|         nlink |  | ||||||
|         mode |  | ||||||
|         uid |  | ||||||
|         gid |  | ||||||
|         atime |  | ||||||
|         mtime |  | ||||||
|         ctime |  | ||||||
|     /) { |  | ||||||
|         $less_linked->{$copy_attr} = $more_linked->{$copy_attr}; |  | ||||||
|     } |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | package Directory::Simplify::Utils; | ||||||
|  | use strict; | ||||||
|  | use warnings; | ||||||
|  | 
 | ||||||
| sub hr_size { | sub hr_size { | ||||||
|     my $sz = shift; |     my $sz = shift; | ||||||
|     my @sizes = qw/ B KB MB GB TB PB EB ZB YB /; |     my @sizes = qw/ B KB MB GB TB PB EB ZB YB /; | ||||||
|  | @ -272,137 +219,148 @@ sub hr_size { | ||||||
|     wantarray ? @ret : "@ret" |     wantarray ? @ret : "@ret" | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| package Directory::Simplify::FileHash; | sub shell_quote { | ||||||
|  |     # shell-escape argument for inclusion in non-interpolated single quotes | ||||||
|  |     my @transformed = map { | ||||||
|  |         (my $out = $_) | ||||||
|  |             =~ s/'/'\\''/g; | ||||||
|  |         "'$out'"; | ||||||
|  |     } @_; | ||||||
|  |     wantarray ? @transformed : $transformed[0]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | package Directory::Simplify::Instruction::Hardlink; | ||||||
| use strict; | use strict; | ||||||
| use warnings; | use warnings; | ||||||
| 
 | use overload '""' => 'as_string'; | ||||||
| =head1 DESCRIPTION |  | ||||||
| 
 |  | ||||||
| Object for abstracting management of a hashed filesystem |  | ||||||
| 
 |  | ||||||
| =cut |  | ||||||
| 
 | 
 | ||||||
| sub new { | sub new { | ||||||
|     my ($class, $self) = (shift, {}); |     my $class = shift; | ||||||
| 
 |     return bless { | ||||||
|     $self->{_files} = {}; |         freed => 0, | ||||||
| 
 |         @_, | ||||||
|     require Digest::SHA; |     }, $class; | ||||||
|     $self->{_ctx} = Digest::SHA->new; |  | ||||||
| 
 |  | ||||||
|     # default options |  | ||||||
|     $self->{_min_linkable_size} = 1; |  | ||||||
| 
 |  | ||||||
|     bless $self, $class |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| =head2 min_linkable_size($bytes) | sub run { | ||||||
| 
 |  | ||||||
| Set or get the minimum size of files to be considered hard-linkable. Default is 1. |  | ||||||
| 
 |  | ||||||
| =cut |  | ||||||
| 
 |  | ||||||
| sub min_linkable_size { |  | ||||||
|     my $self = shift; |     my $self = shift; | ||||||
|     my $in = shift; |     # hard link the files | ||||||
|     if (defined $in) { | 
 | ||||||
|         $self->{_min_linkable_size} = $in; |     unless (unlink $self->{target}->{name}) { | ||||||
|  |         die "Failed to remove file `$self->{target}->{name}': $!\n"; | ||||||
|     } |     } | ||||||
|     $self->{_min_linkable_size} |     unless (link $self->{source}->{name}, $self->{target}->{name}) { | ||||||
| } |         die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!"; | ||||||
| 
 |     } | ||||||
| =head2 make_entry($filename) |     # bookkeeping | ||||||
| 
 |     ++$self->{source}->{nlink}; | ||||||
| =cut |     if (--$self->{target}->{nlink} == 0) { | ||||||
| 
 |         $self->{freed} = $self->{target}->{size}; | ||||||
| sub make_entry { |  | ||||||
|     my $self = shift; |  | ||||||
|     my ($filename) = @_; |  | ||||||
|     # organize results from lstat into hash |  | ||||||
|     my $entry = {}; |  | ||||||
|     (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size |  | ||||||
|             atime mtime ctime blksize blocks /}) |  | ||||||
|         = ($filename, lstat $filename); |  | ||||||
| 
 |  | ||||||
|     $entry->{hash} = $self->_hash($filename); |  | ||||||
| 
 |  | ||||||
|     $entry |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| =head2 add_entry($entry) |  | ||||||
| 
 |  | ||||||
| =cut |  | ||||||
| 
 |  | ||||||
| sub add_entry { |  | ||||||
|     my $self = shift; |  | ||||||
|     my ($entry) = @_; |  | ||||||
| 
 |  | ||||||
|     my $hash = $entry->{hash}; |  | ||||||
| 
 |  | ||||||
|     unless (defined $self->{_files}->{$hash}) { |  | ||||||
|         $self->{_files}->{$hash} = []; |  | ||||||
|     } |     } | ||||||
|     push @{$self->{_files}->{$hash}}, $entry; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| sub find_hardlinkable { | sub bytes_freed { | ||||||
|     my $self = shift; |     my $self = shift; | ||||||
|     my ($entry) = @_; |     return $self->{freed}; | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
|     my $hash = $entry->{hash}; | sub as_string { | ||||||
|  |     my $self = shift; | ||||||
|  |     return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name}); | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
|     # no matching entries | package Directory::Simplify::Instruction::CopyTimestamp; | ||||||
|     unless (defined $self->{_files}->{$hash}) { | use strict; | ||||||
|         return (); | use warnings; | ||||||
|     } | use overload '""' => 'as_string'; | ||||||
| 
 | 
 | ||||||
|     my @matches; | sub new { | ||||||
|     foreach my $ent (@{$self->{_files}->{$hash}}) { |     my $class = shift; | ||||||
|         if ($self->_entries_are_hard_linkable($entry, $ent)) { |     return bless { | ||||||
|             push @matches, $ent; |         @_, | ||||||
|  |     }, $class; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub run { | ||||||
|  |     my $self = shift; | ||||||
|  |     # preserve older time stamp | ||||||
|  |     utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name}; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub bytes_freed { | ||||||
|  |     return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub as_string { | ||||||
|  |     my $self = shift; | ||||||
|  |     return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name}); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | package Directory::Simplify::Instruction::Generator; | ||||||
|  | use strict; | ||||||
|  | use warnings; | ||||||
|  | use overload '""' => 'as_string'; | ||||||
|  | use File::Basename qw/ dirname /; | ||||||
|  | use File::Compare qw/ compare /; | ||||||
|  | 
 | ||||||
|  | sub new { | ||||||
|  |     my $class = shift; | ||||||
|  |     return bless { | ||||||
|  |         filehash => undef, | ||||||
|  |         min_size => 1, | ||||||
|  |         @_, | ||||||
|  |     }, $class; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub as_string { | ||||||
|  |     my $self = shift; | ||||||
|  |     join "\n", $self->instructions; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub buckets { | ||||||
|  |     my $self = shift; | ||||||
|  | 
 | ||||||
|  |     my @candidate_lists = $self->{filehash}->entries; | ||||||
|  | 
 | ||||||
|  |     my @buckets; | ||||||
|  |     foreach my $candidates (@candidate_lists) { | ||||||
|  |         my @ca = @{$candidates}; # make a clone | ||||||
|  |         my @these_buckets; | ||||||
|  | 
 | ||||||
|  |         # at least two files needed to link together | ||||||
|  |         if (@ca > 1) { | ||||||
|  |             ELIMINATOR: while (@ca) { | ||||||
|  |                 my $entry = shift @ca; | ||||||
|  | 
 | ||||||
|  |                 next ELIMINATOR if $self->_entry_should_be_skipped($entry); | ||||||
|  | 
 | ||||||
|  |                 foreach my $bucket_idx (0 .. $#these_buckets) { | ||||||
|  |                     if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) { | ||||||
|  |                         push @{$these_buckets[$bucket_idx]}, $entry; | ||||||
|  |                         next ELIMINATOR; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 # didn't find a bucket (even though the hash matched!) | ||||||
|  |                 # -> new bucket | ||||||
|  |                 push @these_buckets, [$entry]; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |         push @buckets, @these_buckets; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     return @matches; |     @buckets | ||||||
| 
 |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| =head2 oldest($entry_a, $entry_b, ...) | sub _entry_should_be_skipped { | ||||||
| 
 |     my ($self, $entry_a) = @_; | ||||||
| Find the file less embedded in the file system. |     # too small to be hard-linked | ||||||
| 
 |     if ($entry_a->{size} < $self->{min_size}) { | ||||||
| =cut |         return 1; | ||||||
| 
 |     } | ||||||
| sub less_linked { |     return 0; | ||||||
|     my $self = shift; |  | ||||||
|     return sort |  | ||||||
|         {$a->{nlink} <=> $b->{nlink}} |  | ||||||
|         @_; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| =head2 oldest($entry_a, $entry_b, ...) |  | ||||||
| 
 |  | ||||||
| Find the entry with the oldest time stamp. |  | ||||||
| 
 |  | ||||||
| =cut |  | ||||||
| 
 |  | ||||||
| sub oldest_mtime { |  | ||||||
|     my $self = shift; |  | ||||||
| 
 |  | ||||||
|     return sort |  | ||||||
|         {$a->{mtime} <=> $b->{mtime}} |  | ||||||
|         @_; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| sub _hash { |  | ||||||
|     my $self = shift; |  | ||||||
|     my ($filename) = @_; |  | ||||||
|     $self->{_ctx}->addfile($filename); |  | ||||||
|     return $self->{_ctx}->hexdigest; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| sub _entries_are_hard_linkable { | sub _entries_are_hard_linkable { | ||||||
|     my $self = shift; |  | ||||||
|     my ($entry_a, $entry_b) = @_; |     my ($entry_a, $entry_b) = @_; | ||||||
| 
 | 
 | ||||||
|     # obviously, if the sizes aren't the same, they're not the same file |     # obviously, if the sizes aren't the same, they're not the same file | ||||||
|  | @ -410,15 +368,7 @@ sub _entries_are_hard_linkable { | ||||||
|         return 0; |         return 0; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     # too small to be hard-linked |  | ||||||
|     if ($entry_a->{size} < $self->min_linkable_size) { |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     # they're the same file, don't try it |     # they're the same file, don't try it | ||||||
|     if (&_entries_are_same_filename($entry_a, $entry_b)) { |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
|     if (&_entries_are_already_hard_linked($entry_a, $entry_b)) { |     if (&_entries_are_already_hard_linked($entry_a, $entry_b)) { | ||||||
|         return 0; |         return 0; | ||||||
|     } |     } | ||||||
|  | @ -429,13 +379,26 @@ sub _entries_are_hard_linkable { | ||||||
|     return 0; |     return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| sub _entries_are_same_filename { | sub oldest_mtime { | ||||||
|     my ($entry_a, $entry_b) = @_; |     my $self = shift; | ||||||
|  |     return sort { | ||||||
|  |         $a->{mtime} <=> $b->{mtime} | ||||||
|  |     } @_; | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
|     if ($entry_a->{name} eq $entry_b->{name}) { | sub more_linked { | ||||||
|         return 1; |     my $self = shift; | ||||||
|     } |     my %warned; | ||||||
|     return 0; |     return sort { | ||||||
|  |         if (! -w &dirname($a->{name})) { | ||||||
|  |             warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++; | ||||||
|  |             return 1; # favor a -> front | ||||||
|  |         } elsif (! -w &dirname($b->{name})) { | ||||||
|  |             warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++; | ||||||
|  |             return -1; # favor b -> front | ||||||
|  |         } | ||||||
|  |         $b->{nlink} <=> $a->{nlink} | ||||||
|  |     } @_; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| sub _entries_are_already_hard_linked { | sub _entries_are_already_hard_linked { | ||||||
|  | @ -456,17 +419,10 @@ sub _entries_sizes_match { | ||||||
|     } |     } | ||||||
|     return 1; |     return 1; | ||||||
| } | } | ||||||
| 
 |  | ||||||
| sub _entries_contents_match { | sub _entries_contents_match { | ||||||
|     my ($entry_a, $entry_b) = @_; |     my ($entry_a, $entry_b) = @_; | ||||||
| 
 | 
 | ||||||
|     # also, if the hashes aren't the same, they cannot be the same file |     my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name})); | ||||||
|     if ($entry_a->{hash} ne $entry_b->{hash}) { |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     use File::Compare   qw/ compare /; |  | ||||||
|     my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name})); |  | ||||||
| 
 | 
 | ||||||
|     # warn about hash collision |     # warn about hash collision | ||||||
|     unless ($contents_same) { |     unless ($contents_same) { | ||||||
|  | @ -474,3 +430,92 @@ sub _entries_contents_match { | ||||||
|     } |     } | ||||||
|     return $contents_same; |     return $contents_same; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | # generate hardlink instructions | ||||||
|  | sub instructions { | ||||||
|  |     require Scalar::Util; | ||||||
|  |     my $self = shift; | ||||||
|  | 
 | ||||||
|  |     # start generating instructions | ||||||
|  |     my @inst; | ||||||
|  |     foreach my $bucket ($self->buckets) { | ||||||
|  | 
 | ||||||
|  |         # of the bucket, find the oldest timestamp | ||||||
|  |         my ($oldest_entry) = $self->oldest_mtime(@{$bucket}); | ||||||
|  | 
 | ||||||
|  |         # of the bucket, find the file most embedded in the file system | ||||||
|  |         my @to_link = $self->more_linked(@{$bucket}); | ||||||
|  |         my $most_linked_entry = shift @to_link; | ||||||
|  |         foreach my $entry (@to_link) { | ||||||
|  |             # XXX there shouldn't be a need to update entries' link counts, | ||||||
|  |             # since this generates all the instructions at once | ||||||
|  |             push @inst, Directory::Simplify::Instruction::Hardlink->new( | ||||||
|  |                 source => $most_linked_entry, | ||||||
|  |                 target => $entry, | ||||||
|  |             ); | ||||||
|  |             push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( | ||||||
|  |                 source => $oldest_entry, | ||||||
|  |                 target => $entry, | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |         if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) { | ||||||
|  |             # most_linked_entry should get its timestamp updated | ||||||
|  |             push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( | ||||||
|  |                 source => $oldest_entry, | ||||||
|  |                 target => $most_linked_entry, | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     @inst | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | package Directory::Simplify::FileHash; | ||||||
|  | use strict; | ||||||
|  | use warnings; | ||||||
|  | require Digest::SHA; | ||||||
|  | 
 | ||||||
|  | =head1 DESCRIPTION | ||||||
|  | 
 | ||||||
|  | Object for abstracting management of a hashed filesystem | ||||||
|  | 
 | ||||||
|  | =cut | ||||||
|  | 
 | ||||||
|  | sub new { | ||||||
|  |     my $class = shift; | ||||||
|  |     return bless { | ||||||
|  |         _entries => {}, | ||||||
|  |         _files_in_hash => {}, | ||||||
|  |         @_, | ||||||
|  |     }, $class; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub add { | ||||||
|  |     require Cwd; | ||||||
|  |     my $self = shift; | ||||||
|  |     my $ctx = $self->{_ctx}; | ||||||
|  |     unless (defined $ctx) { | ||||||
|  |         $ctx = $self->{_ctx} = Digest::SHA->new; | ||||||
|  |     } | ||||||
|  |     foreach my $filename (@_) { | ||||||
|  |         $filename = Cwd::abs_path($filename); | ||||||
|  |         unless ($self->{_files_in_hash}->{$filename}) { | ||||||
|  |             my $entry = {}; | ||||||
|  |             (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size | ||||||
|  |                 atime mtime ctime blksize blocks /}) | ||||||
|  |             = ($filename, lstat $filename); | ||||||
|  | 
 | ||||||
|  |             $ctx->addfile($filename); | ||||||
|  |             my $hash = $ctx->hexdigest; | ||||||
|  |             unless (defined $self->{_entries}->{$hash}) { | ||||||
|  |                 $self->{_entries}->{$hash} = []; | ||||||
|  |             } | ||||||
|  |             push @{$self->{_entries}->{$hash}}, $entry; | ||||||
|  |         } | ||||||
|  |         $self->{_files_in_hash}->{$filename} = 1; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | sub entries { | ||||||
|  |     my $self = shift; | ||||||
|  |     values %{$self->{_entries}} | ||||||
|  | } | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue