From c77fc7a205bcaebdc51acb689f0fd50c376e93b6 Mon Sep 17 00:00:00 2001 From: Dan Church Date: Sun, 21 Jan 2018 15:48:49 -0600 Subject: [PATCH] Update script Follows a more logical process of deciding what to do, then doing it. --- CHANGELOG.md | 6 + simplify_static_dir.pl | 551 ++++++++++++++++++++++------------------- 2 files changed, 304 insertions(+), 253 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 126185e..0d276f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +## [3.0.0] + +### Changed + +- Re-order operations to make the linking happen at the very end + ## [2.0.0] ### Fixed diff --git a/simplify_static_dir.pl b/simplify_static_dir.pl index 83582d7..19c5482 100755 --- a/simplify_static_dir.pl +++ b/simplify_static_dir.pl @@ -1,9 +1,12 @@ #!/usr/bin/perl # vi: et sts=4 sw=4 ts=4 + +package main; use strict; use warnings; +require Cwd; -our $VERSION = '2.0.0'; +our $VERSION = '3.0.0'; =pod @@ -117,142 +120,86 @@ EOF exit 0; } -my %opts = ( - v => 0, - f => 0, - m => '', - M => '', - z => 0, -); +MAIN: { + my %opts = ( + v => 0, + f => 0, + m => '', + M => '', + z => 0, + ); -&getopts('vfm:M:z', \%opts); + &getopts('vfm:M:z', \%opts); -my $filehash = new Directory::Simplify::FileHash; + # correct relative paths + # OR if no directories given, search the current directory + my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD})); -# include zero-length files if wanted (`-z') -$filehash->min_linkable_size(0) - if $opts{z}; - -# correct relative paths -# OR if no directories given, search the current directory -push @ARGV, $ENV{PWD} unless map { s#^([^/])#$ENV{PWD}/$1# } @ARGV; - -my $freed_bytes = 0; - -&find(\&findexec, @ARGV); - -printf STDERR "freed %d bytes (%0.4G %s)\n", - $freed_bytes, &hr_size($freed_bytes) - if $opts{f} or $opts{v}; - -sub findexec { - # outright skip directories (don't report skip) - return if -d $File::Find::name; - - # limit to or exclude file patterns specified by `-m' or `-M', - # respectively - - # truth table - # -m matches | -M is used & matches | !return? - # 0 | 0 | 0 - # 0 | 1 | 0 - # 1 | 0 | 1 - # 1 | 1 | 0 - # note: m// will match everything - unless ($File::Find::name =~ m/$opts{m}/ and - !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) { - - print STDERR "Skipping path `$File::Find::name'\n" - if $opts{v}; - return; - } - - # skip non-existent files and links - unless (-f $File::Find::name && ! -l $File::Find::name) { - return; - } - - my $entry = $filehash->make_entry($File::Find::name); - - my @linkable = $filehash->find_hardlinkable($entry); - if (@linkable) { - &hardlink_entries($entry, @linkable); - } - $filehash->add_entry($entry); -} - -sub hardlink_entries { - my ($entry, @linkable) = @_; - - # only one of the linkable entries should suffice - my $linking_with = $linkable[0]; - - # calculate the timestamp of the resulting file - my ($atime, $mtime) = @{( - $filehash->oldest_mtime($entry, $linking_with) - )[0]}{qw/ atime mtime /}; - - # find the file less embedded in the file system - my ($less_linked, $more_linked) = $filehash->less_linked($entry, $linking_with); - - printf STDERR "removing file `%s'\n", $less_linked->{name} + my @files; + print STDERR 'Finding files...' if $opts{v}; - unless (unlink $less_linked->{name}) { - printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n", - $less_linked->{name}, - $!, - $more_linked->{name}; + &find(sub { + # outright skip directories (don't report skip) + return if -d $File::Find::name; - # if we can't do this, there's no point in continuing - unless (unlink $more_linked->{name}) { - printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n", - $more_linked->{name}, - $!; + # limit to or exclude file patterns specified by `-m' or `-M', + # respectively + # truth table + # -m matches | -M is used & matches | !return? + # 0 | 0 | 0 + # 0 | 1 | 0 + # 1 | 0 | 1 + # 1 | 1 | 0 + # note: m// will match everything + unless ($File::Find::name =~ m/$opts{m}/ and + !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) { + + print STDERR "Skipping path `$File::Find::name'\n" + if $opts{v}; return; } - # the ol' switcheroo - ($more_linked, $less_linked) = - ($less_linked, $more_linked); + # skip non-existent files and links + unless (-f $File::Find::name && ! -l $File::Find::name) { + return; + } + + push @files, $File::Find::name; + }, @dirs_to_process); + + printf STDERR "%s files found.\nGenerating hashes...", scalar @files + if $opts{v}; + + my $filehash = Directory::Simplify::FileHash->new; + $filehash->add(@files); + print STDERR "done.\n" + if $opts{v}; + + my $generator = Directory::Simplify::Instruction::Generator->new( + filehash => $filehash, + min_size => ($opts{z} ? 0 : 1), + ); + + my $freed_bytes = 0; + + foreach my $inst ($generator->instructions) { + print STDERR $inst, "\n" if $opts{v}; + $inst->run; + $freed_bytes += $inst->bytes_freed; } - # we unlinked it or failed out - $freed_bytes += $less_linked->{size} - unless $less_linked->{nlink} > 1; - - printf STDERR "hard linking `%s' => `%s'\n", - $less_linked->{name}, $more_linked->{name} - if $opts{v}; - - # hard link the files - link $more_linked->{name}, - $less_linked->{name}; - - # preserve older time stamp - utime $atime, $mtime, $more_linked->{name}; - $more_linked->{atime} = $atime; - $more_linked->{mtime} = $mtime; - - # update link count in our hash to reflect the file system (referenced) - ++$more_linked->{nlink}; - - # update old entry with the info from the new one - foreach my $copy_attr (qw/ - ino - nlink - mode - uid - gid - atime - mtime - ctime - /) { - $less_linked->{$copy_attr} = $more_linked->{$copy_attr}; - } + printf STDERR "freed %d bytes (%0.4G %s)\n", + $freed_bytes, + Directory::Simplify::Utils::hr_size($freed_bytes) + if $opts{f} or $opts{v}; } +package Directory::Simplify::Utils; +use strict; +use warnings; + sub hr_size { my $sz = shift; my @sizes = qw/ B KB MB GB TB PB EB ZB YB /; @@ -272,137 +219,148 @@ sub hr_size { wantarray ? @ret : "@ret" } -package Directory::Simplify::FileHash; +sub shell_quote { + # shell-escape argument for inclusion in non-interpolated single quotes + my @transformed = map { + (my $out = $_) + =~ s/'/'\\''/g; + "'$out'"; + } @_; + wantarray ? @transformed : $transformed[0]; +} + +package Directory::Simplify::Instruction::Hardlink; use strict; use warnings; - -=head1 DESCRIPTION - -Object for abstracting management of a hashed filesystem - -=cut +use overload '""' => 'as_string'; sub new { - my ($class, $self) = (shift, {}); - - $self->{_files} = {}; - - require Digest::SHA; - $self->{_ctx} = Digest::SHA->new; - - # default options - $self->{_min_linkable_size} = 1; - - bless $self, $class + my $class = shift; + return bless { + freed => 0, + @_, + }, $class; } -=head2 min_linkable_size($bytes) - -Set or get the minimum size of files to be considered hard-linkable. Default is 1. - -=cut - -sub min_linkable_size { +sub run { my $self = shift; - my $in = shift; - if (defined $in) { - $self->{_min_linkable_size} = $in; + # hard link the files + + unless (unlink $self->{target}->{name}) { + die "Failed to remove file `$self->{target}->{name}': $!\n"; } - $self->{_min_linkable_size} -} - -=head2 make_entry($filename) - -=cut - -sub make_entry { - my $self = shift; - my ($filename) = @_; - # organize results from lstat into hash - my $entry = {}; - (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size - atime mtime ctime blksize blocks /}) - = ($filename, lstat $filename); - - $entry->{hash} = $self->_hash($filename); - - $entry -} - -=head2 add_entry($entry) - -=cut - -sub add_entry { - my $self = shift; - my ($entry) = @_; - - my $hash = $entry->{hash}; - - unless (defined $self->{_files}->{$hash}) { - $self->{_files}->{$hash} = []; + unless (link $self->{source}->{name}, $self->{target}->{name}) { + die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!"; + } + # bookkeeping + ++$self->{source}->{nlink}; + if (--$self->{target}->{nlink} == 0) { + $self->{freed} = $self->{target}->{size}; } - push @{$self->{_files}->{$hash}}, $entry; } -sub find_hardlinkable { +sub bytes_freed { my $self = shift; - my ($entry) = @_; + return $self->{freed}; +} - my $hash = $entry->{hash}; +sub as_string { + my $self = shift; + return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name}); +} - # no matching entries - unless (defined $self->{_files}->{$hash}) { - return (); - } +package Directory::Simplify::Instruction::CopyTimestamp; +use strict; +use warnings; +use overload '""' => 'as_string'; - my @matches; - foreach my $ent (@{$self->{_files}->{$hash}}) { - if ($self->_entries_are_hard_linkable($entry, $ent)) { - push @matches, $ent; +sub new { + my $class = shift; + return bless { + @_, + }, $class; +} + +sub run { + my $self = shift; + # preserve older time stamp + utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name}; +} + +sub bytes_freed { + return 0; +} + +sub as_string { + my $self = shift; + return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name}); +} + +package Directory::Simplify::Instruction::Generator; +use strict; +use warnings; +use overload '""' => 'as_string'; +use File::Basename qw/ dirname /; +use File::Compare qw/ compare /; + +sub new { + my $class = shift; + return bless { + filehash => undef, + min_size => 1, + @_, + }, $class; +} + +sub as_string { + my $self = shift; + join "\n", $self->instructions; +} + +sub buckets { + my $self = shift; + + my @candidate_lists = $self->{filehash}->entries; + + my @buckets; + foreach my $candidates (@candidate_lists) { + my @ca = @{$candidates}; # make a clone + my @these_buckets; + + # at least two files needed to link together + if (@ca > 1) { + ELIMINATOR: while (@ca) { + my $entry = shift @ca; + + next ELIMINATOR if $self->_entry_should_be_skipped($entry); + + foreach my $bucket_idx (0 .. $#these_buckets) { + if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) { + push @{$these_buckets[$bucket_idx]}, $entry; + next ELIMINATOR; + } + } + # didn't find a bucket (even though the hash matched!) + # -> new bucket + push @these_buckets, [$entry]; + } } + push @buckets, @these_buckets; } - return @matches; - + @buckets } -=head2 oldest($entry_a, $entry_b, ...) - -Find the file less embedded in the file system. - -=cut - -sub less_linked { - my $self = shift; - return sort - {$a->{nlink} <=> $b->{nlink}} - @_; -} - -=head2 oldest($entry_a, $entry_b, ...) - -Find the entry with the oldest time stamp. - -=cut - -sub oldest_mtime { - my $self = shift; - - return sort - {$a->{mtime} <=> $b->{mtime}} - @_; -} - -sub _hash { - my $self = shift; - my ($filename) = @_; - $self->{_ctx}->addfile($filename); - return $self->{_ctx}->hexdigest; +sub _entry_should_be_skipped { + my ($self, $entry_a) = @_; + # too small to be hard-linked + if ($entry_a->{size} < $self->{min_size}) { + return 1; + } + return 0; } sub _entries_are_hard_linkable { - my $self = shift; my ($entry_a, $entry_b) = @_; # obviously, if the sizes aren't the same, they're not the same file @@ -410,15 +368,7 @@ sub _entries_are_hard_linkable { return 0; } - # too small to be hard-linked - if ($entry_a->{size} < $self->min_linkable_size) { - return 0; - } - # they're the same file, don't try it - if (&_entries_are_same_filename($entry_a, $entry_b)) { - return 0; - } if (&_entries_are_already_hard_linked($entry_a, $entry_b)) { return 0; } @@ -429,13 +379,26 @@ sub _entries_are_hard_linkable { return 0; } -sub _entries_are_same_filename { - my ($entry_a, $entry_b) = @_; +sub oldest_mtime { + my $self = shift; + return sort { + $a->{mtime} <=> $b->{mtime} + } @_; +} - if ($entry_a->{name} eq $entry_b->{name}) { - return 1; - } - return 0; +sub more_linked { + my $self = shift; + my %warned; + return sort { + if (! -w &dirname($a->{name})) { + warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++; + return 1; # favor a -> front + } elsif (! -w &dirname($b->{name})) { + warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++; + return -1; # favor b -> front + } + $b->{nlink} <=> $a->{nlink} + } @_; } sub _entries_are_already_hard_linked { @@ -456,17 +419,10 @@ sub _entries_sizes_match { } return 1; } - sub _entries_contents_match { my ($entry_a, $entry_b) = @_; - # also, if the hashes aren't the same, they cannot be the same file - if ($entry_a->{hash} ne $entry_b->{hash}) { - return 0; - } - - use File::Compare qw/ compare /; - my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name})); + my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name})); # warn about hash collision unless ($contents_same) { @@ -474,3 +430,92 @@ sub _entries_contents_match { } return $contents_same; } + +# generate hardlink instructions +sub instructions { + require Scalar::Util; + my $self = shift; + + # start generating instructions + my @inst; + foreach my $bucket ($self->buckets) { + + # of the bucket, find the oldest timestamp + my ($oldest_entry) = $self->oldest_mtime(@{$bucket}); + + # of the bucket, find the file most embedded in the file system + my @to_link = $self->more_linked(@{$bucket}); + my $most_linked_entry = shift @to_link; + foreach my $entry (@to_link) { + # XXX there shouldn't be a need to update entries' link counts, + # since this generates all the instructions at once + push @inst, Directory::Simplify::Instruction::Hardlink->new( + source => $most_linked_entry, + target => $entry, + ); + push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( + source => $oldest_entry, + target => $entry, + ); + } + if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) { + # most_linked_entry should get its timestamp updated + push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( + source => $oldest_entry, + target => $most_linked_entry, + ); + } + } + @inst +} + +package Directory::Simplify::FileHash; +use strict; +use warnings; +require Digest::SHA; + +=head1 DESCRIPTION + +Object for abstracting management of a hashed filesystem + +=cut + +sub new { + my $class = shift; + return bless { + _entries => {}, + _files_in_hash => {}, + @_, + }, $class; +} + +sub add { + require Cwd; + my $self = shift; + my $ctx = $self->{_ctx}; + unless (defined $ctx) { + $ctx = $self->{_ctx} = Digest::SHA->new; + } + foreach my $filename (@_) { + $filename = Cwd::abs_path($filename); + unless ($self->{_files_in_hash}->{$filename}) { + my $entry = {}; + (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size + atime mtime ctime blksize blocks /}) + = ($filename, lstat $filename); + + $ctx->addfile($filename); + my $hash = $ctx->hexdigest; + unless (defined $self->{_entries}->{$hash}) { + $self->{_entries}->{$hash} = []; + } + push @{$self->{_entries}->{$hash}}, $entry; + } + $self->{_files_in_hash}->{$filename} = 1; + } +} + +sub entries { + my $self = shift; + values %{$self->{_entries}} +}