Update script

Follows a more logical process of deciding what to do, then doing it.
This commit is contained in:
Dan Church 2018-01-21 15:48:49 -06:00
parent 7d19ccd876
commit c77fc7a205
Signed by: h3xx
GPG Key ID: EA2BF379CD2CDBD0
2 changed files with 304 additions and 253 deletions

View File

@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
## [Unreleased] ## [Unreleased]
## [3.0.0]
### Changed
- Re-order operations to make the linking happen at the very end
## [2.0.0] ## [2.0.0]
### Fixed ### Fixed

View File

@ -1,9 +1,12 @@
#!/usr/bin/perl #!/usr/bin/perl
# vi: et sts=4 sw=4 ts=4 # vi: et sts=4 sw=4 ts=4
package main;
use strict; use strict;
use warnings; use warnings;
require Cwd;
our $VERSION = '2.0.0'; our $VERSION = '3.0.0';
=pod =pod
@ -117,142 +120,86 @@ EOF
exit 0; exit 0;
} }
my %opts = ( MAIN: {
v => 0, my %opts = (
f => 0, v => 0,
m => '', f => 0,
M => '', m => '',
z => 0, M => '',
); z => 0,
);
&getopts('vfm:M:z', \%opts); &getopts('vfm:M:z', \%opts);
my $filehash = new Directory::Simplify::FileHash; # correct relative paths
# OR if no directories given, search the current directory
my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
# include zero-length files if wanted (`-z') my @files;
$filehash->min_linkable_size(0) print STDERR 'Finding files...'
if $opts{z};
# correct relative paths
# OR if no directories given, search the current directory
push @ARGV, $ENV{PWD} unless map { s#^([^/])#$ENV{PWD}/$1# } @ARGV;
my $freed_bytes = 0;
&find(\&findexec, @ARGV);
printf STDERR "freed %d bytes (%0.4G %s)\n",
$freed_bytes, &hr_size($freed_bytes)
if $opts{f} or $opts{v};
sub findexec {
# outright skip directories (don't report skip)
return if -d $File::Find::name;
# limit to or exclude file patterns specified by `-m' or `-M',
# respectively
# truth table
# -m matches | -M is used & matches | !return?
# 0 | 0 | 0
# 0 | 1 | 0
# 1 | 0 | 1
# 1 | 1 | 0
# note: m// will match everything
unless ($File::Find::name =~ m/$opts{m}/ and
!(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {
print STDERR "Skipping path `$File::Find::name'\n"
if $opts{v};
return;
}
# skip non-existent files and links
unless (-f $File::Find::name && ! -l $File::Find::name) {
return;
}
my $entry = $filehash->make_entry($File::Find::name);
my @linkable = $filehash->find_hardlinkable($entry);
if (@linkable) {
&hardlink_entries($entry, @linkable);
}
$filehash->add_entry($entry);
}
sub hardlink_entries {
my ($entry, @linkable) = @_;
# only one of the linkable entries should suffice
my $linking_with = $linkable[0];
# calculate the timestamp of the resulting file
my ($atime, $mtime) = @{(
$filehash->oldest_mtime($entry, $linking_with)
)[0]}{qw/ atime mtime /};
# find the file less embedded in the file system
my ($less_linked, $more_linked) = $filehash->less_linked($entry, $linking_with);
printf STDERR "removing file `%s'\n", $less_linked->{name}
if $opts{v}; if $opts{v};
unless (unlink $less_linked->{name}) { &find(sub {
printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n", # outright skip directories (don't report skip)
$less_linked->{name}, return if -d $File::Find::name;
$!,
$more_linked->{name};
# if we can't do this, there's no point in continuing # limit to or exclude file patterns specified by `-m' or `-M',
unless (unlink $more_linked->{name}) { # respectively
printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
$more_linked->{name},
$!;
# truth table
# -m matches | -M is used & matches | !return?
# 0 | 0 | 0
# 0 | 1 | 0
# 1 | 0 | 1
# 1 | 1 | 0
# note: m// will match everything
unless ($File::Find::name =~ m/$opts{m}/ and
!(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {
print STDERR "Skipping path `$File::Find::name'\n"
if $opts{v};
return; return;
} }
# the ol' switcheroo # skip non-existent files and links
($more_linked, $less_linked) = unless (-f $File::Find::name && ! -l $File::Find::name) {
($less_linked, $more_linked); return;
}
push @files, $File::Find::name;
}, @dirs_to_process);
printf STDERR "%s files found.\nGenerating hashes...", scalar @files
if $opts{v};
my $filehash = Directory::Simplify::FileHash->new;
$filehash->add(@files);
print STDERR "done.\n"
if $opts{v};
my $generator = Directory::Simplify::Instruction::Generator->new(
filehash => $filehash,
min_size => ($opts{z} ? 0 : 1),
);
my $freed_bytes = 0;
foreach my $inst ($generator->instructions) {
print STDERR $inst, "\n" if $opts{v};
$inst->run;
$freed_bytes += $inst->bytes_freed;
} }
# we unlinked it or failed out printf STDERR "freed %d bytes (%0.4G %s)\n",
$freed_bytes += $less_linked->{size} $freed_bytes,
unless $less_linked->{nlink} > 1; Directory::Simplify::Utils::hr_size($freed_bytes)
if $opts{f} or $opts{v};
printf STDERR "hard linking `%s' => `%s'\n",
$less_linked->{name}, $more_linked->{name}
if $opts{v};
# hard link the files
link $more_linked->{name},
$less_linked->{name};
# preserve older time stamp
utime $atime, $mtime, $more_linked->{name};
$more_linked->{atime} = $atime;
$more_linked->{mtime} = $mtime;
# update link count in our hash to reflect the file system (referenced)
++$more_linked->{nlink};
# update old entry with the info from the new one
foreach my $copy_attr (qw/
ino
nlink
mode
uid
gid
atime
mtime
ctime
/) {
$less_linked->{$copy_attr} = $more_linked->{$copy_attr};
}
} }
package Directory::Simplify::Utils;
use strict;
use warnings;
sub hr_size { sub hr_size {
my $sz = shift; my $sz = shift;
my @sizes = qw/ B KB MB GB TB PB EB ZB YB /; my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
@ -272,137 +219,148 @@ sub hr_size {
wantarray ? @ret : "@ret" wantarray ? @ret : "@ret"
} }
package Directory::Simplify::FileHash; sub shell_quote {
# shell-escape argument for inclusion in non-interpolated single quotes
my @transformed = map {
(my $out = $_)
=~ s/'/'\\''/g;
"'$out'";
} @_;
wantarray ? @transformed : $transformed[0];
}
package Directory::Simplify::Instruction::Hardlink;
use strict; use strict;
use warnings; use warnings;
use overload '""' => 'as_string';
=head1 DESCRIPTION
Object for abstracting management of a hashed filesystem
=cut
sub new { sub new {
my ($class, $self) = (shift, {}); my $class = shift;
return bless {
$self->{_files} = {}; freed => 0,
@_,
require Digest::SHA; }, $class;
$self->{_ctx} = Digest::SHA->new;
# default options
$self->{_min_linkable_size} = 1;
bless $self, $class
} }
=head2 min_linkable_size($bytes) sub run {
Set or get the minimum size of files to be considered hard-linkable. Default is 1.
=cut
sub min_linkable_size {
my $self = shift; my $self = shift;
my $in = shift; # hard link the files
if (defined $in) {
$self->{_min_linkable_size} = $in; unless (unlink $self->{target}->{name}) {
die "Failed to remove file `$self->{target}->{name}': $!\n";
} }
$self->{_min_linkable_size} unless (link $self->{source}->{name}, $self->{target}->{name}) {
} die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
}
=head2 make_entry($filename) # bookkeeping
++$self->{source}->{nlink};
=cut if (--$self->{target}->{nlink} == 0) {
$self->{freed} = $self->{target}->{size};
sub make_entry {
my $self = shift;
my ($filename) = @_;
# organize results from lstat into hash
my $entry = {};
(@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
atime mtime ctime blksize blocks /})
= ($filename, lstat $filename);
$entry->{hash} = $self->_hash($filename);
$entry
}
=head2 add_entry($entry)
=cut
sub add_entry {
my $self = shift;
my ($entry) = @_;
my $hash = $entry->{hash};
unless (defined $self->{_files}->{$hash}) {
$self->{_files}->{$hash} = [];
} }
push @{$self->{_files}->{$hash}}, $entry;
} }
sub find_hardlinkable { sub bytes_freed {
my $self = shift; my $self = shift;
my ($entry) = @_; return $self->{freed};
}
my $hash = $entry->{hash}; sub as_string {
my $self = shift;
return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
}
# no matching entries package Directory::Simplify::Instruction::CopyTimestamp;
unless (defined $self->{_files}->{$hash}) { use strict;
return (); use warnings;
} use overload '""' => 'as_string';
my @matches; sub new {
foreach my $ent (@{$self->{_files}->{$hash}}) { my $class = shift;
if ($self->_entries_are_hard_linkable($entry, $ent)) { return bless {
push @matches, $ent; @_,
}, $class;
}
sub run {
my $self = shift;
# preserve older time stamp
utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
}
sub bytes_freed {
return 0;
}
sub as_string {
my $self = shift;
return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
}
package Directory::Simplify::Instruction::Generator;
use strict;
use warnings;
use overload '""' => 'as_string';
use File::Basename qw/ dirname /;
use File::Compare qw/ compare /;
sub new {
my $class = shift;
return bless {
filehash => undef,
min_size => 1,
@_,
}, $class;
}
sub as_string {
my $self = shift;
join "\n", $self->instructions;
}
sub buckets {
my $self = shift;
my @candidate_lists = $self->{filehash}->entries;
my @buckets;
foreach my $candidates (@candidate_lists) {
my @ca = @{$candidates}; # make a clone
my @these_buckets;
# at least two files needed to link together
if (@ca > 1) {
ELIMINATOR: while (@ca) {
my $entry = shift @ca;
next ELIMINATOR if $self->_entry_should_be_skipped($entry);
foreach my $bucket_idx (0 .. $#these_buckets) {
if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
push @{$these_buckets[$bucket_idx]}, $entry;
next ELIMINATOR;
}
}
# didn't find a bucket (even though the hash matched!)
# -> new bucket
push @these_buckets, [$entry];
}
} }
push @buckets, @these_buckets;
} }
return @matches; @buckets
} }
=head2 oldest($entry_a, $entry_b, ...) sub _entry_should_be_skipped {
my ($self, $entry_a) = @_;
Find the file less embedded in the file system. # too small to be hard-linked
if ($entry_a->{size} < $self->{min_size}) {
=cut return 1;
}
sub less_linked { return 0;
my $self = shift;
return sort
{$a->{nlink} <=> $b->{nlink}}
@_;
}
=head2 oldest($entry_a, $entry_b, ...)
Find the entry with the oldest time stamp.
=cut
sub oldest_mtime {
my $self = shift;
return sort
{$a->{mtime} <=> $b->{mtime}}
@_;
}
sub _hash {
my $self = shift;
my ($filename) = @_;
$self->{_ctx}->addfile($filename);
return $self->{_ctx}->hexdigest;
} }
sub _entries_are_hard_linkable { sub _entries_are_hard_linkable {
my $self = shift;
my ($entry_a, $entry_b) = @_; my ($entry_a, $entry_b) = @_;
# obviously, if the sizes aren't the same, they're not the same file # obviously, if the sizes aren't the same, they're not the same file
@ -410,15 +368,7 @@ sub _entries_are_hard_linkable {
return 0; return 0;
} }
# too small to be hard-linked
if ($entry_a->{size} < $self->min_linkable_size) {
return 0;
}
# they're the same file, don't try it # they're the same file, don't try it
if (&_entries_are_same_filename($entry_a, $entry_b)) {
return 0;
}
if (&_entries_are_already_hard_linked($entry_a, $entry_b)) { if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
return 0; return 0;
} }
@ -429,13 +379,26 @@ sub _entries_are_hard_linkable {
return 0; return 0;
} }
sub _entries_are_same_filename { sub oldest_mtime {
my ($entry_a, $entry_b) = @_; my $self = shift;
return sort {
$a->{mtime} <=> $b->{mtime}
} @_;
}
if ($entry_a->{name} eq $entry_b->{name}) { sub more_linked {
return 1; my $self = shift;
} my %warned;
return 0; return sort {
if (! -w &dirname($a->{name})) {
warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
return 1; # favor a -> front
} elsif (! -w &dirname($b->{name})) {
warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
return -1; # favor b -> front
}
$b->{nlink} <=> $a->{nlink}
} @_;
} }
sub _entries_are_already_hard_linked { sub _entries_are_already_hard_linked {
@ -456,17 +419,10 @@ sub _entries_sizes_match {
} }
return 1; return 1;
} }
sub _entries_contents_match { sub _entries_contents_match {
my ($entry_a, $entry_b) = @_; my ($entry_a, $entry_b) = @_;
# also, if the hashes aren't the same, they cannot be the same file my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
if ($entry_a->{hash} ne $entry_b->{hash}) {
return 0;
}
use File::Compare qw/ compare /;
my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name}));
# warn about hash collision # warn about hash collision
unless ($contents_same) { unless ($contents_same) {
@ -474,3 +430,92 @@ sub _entries_contents_match {
} }
return $contents_same; return $contents_same;
} }
# generate hardlink instructions
# Builds the full list of Instruction objects (Hardlink + CopyTimestamp) for
# every bucket of identical-content files, without executing anything.
# Returns the list of instructions; callers run them and tally bytes_freed.
# NOTE(review): transcribed from a commit-diff rendering; original indentation
# was lost in extraction, so the flat layout below is preserved verbatim.
sub instructions {
require Scalar::Util;
my $self = shift;
# start generating instructions
my @inst;
# each bucket groups entries that are hard-linkable with one another
foreach my $bucket ($self->buckets) {
# of the bucket, find the oldest timestamp
my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
# of the bucket, find the file most embedded in the file system
my @to_link = $self->more_linked(@{$bucket});
# keep the most-linked entry as the link source; all others become targets
my $most_linked_entry = shift @to_link;
foreach my $entry (@to_link) {
# XXX there shouldn't be a need to update entries' link counts,
# since this generates all the instructions at once
push @inst, Directory::Simplify::Instruction::Hardlink->new(
source => $most_linked_entry,
target => $entry,
);
# every re-linked target also receives the bucket's oldest timestamp
push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
source => $oldest_entry,
target => $entry,
);
}
# refaddr comparison: only emit the extra CopyTimestamp when the link
# source and the oldest-mtime entry are genuinely different objects
if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
# most_linked_entry should get its timestamp updated
push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
source => $oldest_entry,
target => $most_linked_entry,
);
}
}
@inst
}
package Directory::Simplify::FileHash;
use strict;
use warnings;
require Digest::SHA;
=head1 DESCRIPTION
Object for abstracting management of a hashed filesystem
=cut
# Constructor for Directory::Simplify::FileHash.
# _entries:       hash-of-arrayrefs keyed by content digest -> list of stat entries
# _files_in_hash: set of absolute filenames already added (dedupe guard in add())
# Trailing @_ lets callers override/extend fields (two-arg bless idiom).
sub new {
my $class = shift;
return bless {
_entries => {},
_files_in_hash => {},
@_,
}, $class;
}
# Add one or more filenames to the hash index.
# For each new (not-yet-seen) absolute path: lstat it into an entry hashref,
# digest its contents with Digest::SHA, and file the entry under that digest
# in _entries. Already-seen paths are skipped via _files_in_hash.
# NOTE(review): a single SHA context object is cached in $self->{_ctx} and
# reused for every file — this relies on hexdigest() resetting the object
# between files (Digest::SHA documents this reset behavior); confirm intended.
sub add {
require Cwd;
my $self = shift;
my $ctx = $self->{_ctx};
unless (defined $ctx) {
$ctx = $self->{_ctx} = Digest::SHA->new;
}
foreach my $filename (@_) {
# normalize so the same file reached by two relative paths dedupes
$filename = Cwd::abs_path($filename);
unless ($self->{_files_in_hash}->{$filename}) {
my $entry = {};
# organize lstat results into a named-field entry hash
(@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
atime mtime ctime blksize blocks /})
= ($filename, lstat $filename);
$ctx->addfile($filename);
my $hash = $ctx->hexdigest;
unless (defined $self->{_entries}->{$hash}) {
$self->{_entries}->{$hash} = [];
}
push @{$self->{_entries}->{$hash}}, $entry;
}
$self->{_files_in_hash}->{$filename} = 1;
}
}
# Return the list of entry-arrayrefs (one arrayref per distinct content
# digest); each arrayref holds the stat entries sharing that digest.
sub entries {
my $self = shift;
values %{$self->{_entries}}
}