From b1b4e18e053b65c4f049e9107ce1accc33219182 Mon Sep 17 00:00:00 2001
From: Dan Church <amphetamachine@gmail.com>
Date: Wed, 16 Nov 2022 11:33:08 -0600
Subject: [PATCH] Break out packages into separate files

---
 .gitignore                                  |   1 +
 Directory/Simplify/File.pm                  |  31 +
 Directory/Simplify/FileHash.pm              |  57 ++
 .../Simplify/Instruction/CopyTimestamp.pm   |  33 +
 Directory/Simplify/Instruction/Generator.pm | 180 ++++++
 Directory/Simplify/Instruction/Hardlink.pm  |  46 ++
 Directory/Simplify/Utils.pm                 |  47 ++
 make-allinone.sh                            |  13 +
 simplify_static_dir-main.pl                 | 257 ++++++++
 simplify_static_dir.pl                      | 611 ------------------
 util/squash                                 |  60 ++
 11 files changed, 725 insertions(+), 611 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 Directory/Simplify/File.pm
 create mode 100644 Directory/Simplify/FileHash.pm
 create mode 100644 Directory/Simplify/Instruction/CopyTimestamp.pm
 create mode 100644 Directory/Simplify/Instruction/Generator.pm
 create mode 100644 Directory/Simplify/Instruction/Hardlink.pm
 create mode 100644 Directory/Simplify/Utils.pm
 create mode 100755 make-allinone.sh
 create mode 100755 simplify_static_dir-main.pl
 delete mode 100755 simplify_static_dir.pl
 create mode 100755 util/squash

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b5d2e09
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/simplify_static_dir.pl
diff --git a/Directory/Simplify/File.pm b/Directory/Simplify/File.pm
new file mode 100644
index 0000000..6591751
--- /dev/null
+++ b/Directory/Simplify/File.pm
@@ -0,0 +1,31 @@
+package Directory::Simplify::File;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+require Cwd;
+
+sub new {
+    my $class = shift;
+    my $rel_name = shift;
+    my $self = bless {
+        rel_name => $rel_name,
+        name => Cwd::abs_path($rel_name),
+    }, $class;
+    (@{$self}{qw/ dev ino mode nlink uid gid rdev size
+                  atime mtime ctime blksize blocks /})
+        = lstat $self->{name};
+    $self
+}
+
+sub hash {
+    my $self = shift;
+    unless (defined $self->{_hash}) {
+        require Digest::SHA;
+        my $ctx = Digest::SHA->new;
+        $ctx->addfile($self->{name});
+        $self->{_hash} = $ctx->hexdigest;
+    }
+    $self->{_hash}
+}
+
+1;
diff --git a/Directory/Simplify/FileHash.pm b/Directory/Simplify/FileHash.pm
new file mode 100644
index 0000000..1e8db4f
--- /dev/null
+++ b/Directory/Simplify/FileHash.pm
@@ -0,0 +1,57 @@
+package Directory::Simplify::FileHash;
+use strict;
+use warnings;
+
+=head1 DESCRIPTION
+
+Object for abstracting management of a hashed filesystem
+
+=cut
+
+# :squash-ignore-start:
+require Directory::Simplify::File;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        _entries => {},
+        _files_in_hash => {},
+        @_,
+    }, $class;
+}
+
+sub add {
+    my $self = shift;
+    my (@files, $callback);
+    if (ref $_[0] eq 'HASH') {
+        # Called method like { files => [] }
+        my %opts = %{$_[0]};
+        @files = @{$opts{files}};
+        $callback = $opts{callback};
+    } else {
+        @files = @_;
+    }
+    foreach my $file (@files) {
+        unless (ref $file eq 'Directory::Simplify::File') {
+            $file = Directory::Simplify::File->new($file);
+        }
+        unless ($self->{_files_in_hash}->{$file->{name}}) {
+            my $hash = $file->hash;
+
+            unless (defined $self->{_entries}->{$hash}) {
+                $self->{_entries}->{$hash} = [];
+            }
+            push @{$self->{_entries}->{$hash}}, $file;
+            &{$callback}($file) if ref $callback eq 'CODE';
+        }
+        $self->{_files_in_hash}->{$file->{name}} = 1;
+    }
+}
+
+sub entries {
+    my $self = shift;
+    values %{$self->{_entries}}
+}
+
+1;
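[Reviewer note, not part of the patch: a minimal sketch of how the two new
modules above fit together. Bare paths passed to add() are wrapped into
Directory::Simplify::File objects, and entries() yields one bucket per
content hash. The file names 'a.txt' and 'b.txt' are hypothetical.]

    use strict;
    use warnings;
    require Directory::Simplify::File;
    require Directory::Simplify::FileHash;

    my $filehash = Directory::Simplify::FileHash->new;
    $filehash->add(
        Directory::Simplify::File->new('a.txt'),
        'b.txt',    # bare paths are wrapped into File objects automatically
    );
    # each bucket is an arrayref of files sharing one SHA digest
    foreach my $bucket ($filehash->entries) {
        printf "%s: %d file(s)\n", $bucket->[0]->hash, scalar @{$bucket};
    }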
diff --git a/Directory/Simplify/Instruction/CopyTimestamp.pm b/Directory/Simplify/Instruction/CopyTimestamp.pm
new file mode 100644
index 0000000..491a4f4
--- /dev/null
+++ b/Directory/Simplify/Instruction/CopyTimestamp.pm
@@ -0,0 +1,33 @@
+package Directory::Simplify::Instruction::CopyTimestamp;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+
+# :squash-ignore-start:
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        @_,
+    }, $class;
+}
+
+sub run {
+    my $self = shift;
+    # preserve older time stamp
+    utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
+}
+
+sub bytes_freed {
+    return 0;
+}
+
+sub as_string {
+    my $self = shift;
+    return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
+
+1;
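[Reviewer note, not part of the patch: every instruction object shares the
same small interface -- run(), bytes_freed(), and as_string() via the '""'
overload -- so the main loop can execute and log them uniformly. $older and
$newer below stand for Directory::Simplify::File objects and are
hypothetical.]

    my $inst = Directory::Simplify::Instruction::CopyTimestamp->new(
        source => $older,
        target => $newer,
    );
    print STDERR "$inst\n";   # stringifies like: touch -r '<older>' '<newer>'
    $inst->run;               # applies the older atime/mtime to the target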
diff --git a/Directory/Simplify/Instruction/Generator.pm b/Directory/Simplify/Instruction/Generator.pm
new file mode 100644
index 0000000..3567d05
--- /dev/null
+++ b/Directory/Simplify/Instruction/Generator.pm
@@ -0,0 +1,180 @@
+package Directory::Simplify::Instruction::Generator;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+use File::Basename qw/ dirname /;
+use File::Compare qw/ compare /;
+
+# :squash-ignore-start:
+require Directory::Simplify::Instruction::CopyTimestamp;
+require Directory::Simplify::Instruction::Hardlink;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        filehash => undef,
+        min_size => 1,
+        @_,
+    }, $class;
+}
+
+sub as_string {
+    my $self = shift;
+    join "\n", $self->instructions;
+}
+
+sub buckets {
+    my $self = shift;
+
+    my @candidate_lists = $self->{filehash}->entries;
+
+    my @buckets;
+    foreach my $candidates (@candidate_lists) {
+        my @ca = @{$candidates}; # make a clone
+        my @these_buckets;
+
+        # at least two files needed to link together
+        if (@ca > 1) {
+            ELIMINATOR: while (@ca) {
+                my $entry = shift @ca;
+
+                next ELIMINATOR if $self->_entry_should_be_skipped($entry);
+
+                foreach my $bucket_idx (0 .. $#these_buckets) {
+                    if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
+                        push @{$these_buckets[$bucket_idx]}, $entry;
+                        next ELIMINATOR;
+                    }
+                }
+                # didn't find a bucket (even though the hash matched!)
+                # -> new bucket
+                push @these_buckets, [$entry];
+            }
+        }
+        push @buckets, @these_buckets;
+    }
+
+    @buckets
+}
+
+sub _entry_should_be_skipped {
+    my ($self, $entry_a) = @_;
+    # too small to be hard-linked
+    if ($entry_a->{size} < $self->{min_size}) {
+        return 1;
+    }
+    return 0;
+}
+
+sub _entries_are_hard_linkable {
+    my ($entry_a, $entry_b) = @_;
+
+    # obviously, if the sizes aren't the same, they're not the same file
+    unless (&_entries_sizes_match($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    # they're the same file, don't try it
+    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    if (&_entries_contents_match($entry_a, $entry_b)) {
+        return 1;
+    }
+    return 0;
+}
+
+sub oldest_mtime {
+    my $self = shift;
+    return sort {
+        $a->{mtime} <=> $b->{mtime}
+    } @_;
+}
+
+sub more_linked {
+    my $self = shift;
+    my %warned;
+    return sort {
+        if (! -w &dirname($a->{name})) {
+            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
+            return 1; # favor a -> front
+        } elsif (! -w &dirname($b->{name})) {
+            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
+            return -1; # favor b -> front
+        }
+        $b->{nlink} <=> $a->{nlink}
+    } @_;
+}
+
+sub _entries_are_already_hard_linked {
+    my ($entry_a, $entry_b) = @_;
+
+    if ($entry_a->{ino} == $entry_b->{ino} and
+        $entry_a->{dev} == $entry_b->{dev}) {
+        return 1;
+    }
+
+    return 0;
+}
+
+sub _entries_sizes_match {
+    my ($entry_a, $entry_b) = @_;
+    if ($entry_a->{size} != $entry_b->{size}) {
+        return 0;
+    }
+    return 1;
+}
+sub _entries_contents_match {
+    my ($entry_a, $entry_b) = @_;
+
+    my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
+
+    # warn about hash collision
+    unless ($contents_same) {
+        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
+    }
+    return $contents_same;
+}
+
+# generate hardlink instructions
+sub instructions {
+    require Scalar::Util;
+    my $self = shift;
+
+    # start generating instructions
+    my @inst;
+    foreach my $bucket ($self->buckets) {
+
+        # of the bucket, find the oldest timestamp
+        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
+
+        # of the bucket, find the file most embedded in the file system
+        my @to_link = $self->more_linked(@{$bucket});
+        my $most_linked_entry = shift @to_link;
+        foreach my $entry (@to_link) {
+            # XXX there shouldn't be a need to update entries' link counts,
+            # since this generates all the instructions at once
+            push @inst, Directory::Simplify::Instruction::Hardlink->new(
+                source => $most_linked_entry,
+                target => $entry,
+            );
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $entry,
+            );
+        }
+        if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
+            # most_linked_entry should get its timestamp updated
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $most_linked_entry,
+            );
+        }
+    }
+    @inst
+}
+
+1;
diff --git a/Directory/Simplify/Instruction/Hardlink.pm b/Directory/Simplify/Instruction/Hardlink.pm
new file mode 100644
index 0000000..9a7e7c0
--- /dev/null
+++ b/Directory/Simplify/Instruction/Hardlink.pm
@@ -0,0 +1,46 @@
+package Directory::Simplify::Instruction::Hardlink;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+
+# :squash-ignore-start:
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        freed => 0,
+        @_,
+    }, $class;
+}
+
+sub run {
+    my $self = shift;
+    # hard link the files
+
+    unless (unlink $self->{target}->{name}) {
+        die "Failed to remove file `$self->{target}->{name}': $!\n";
+    }
+    unless (link $self->{source}->{name}, $self->{target}->{name}) {
+        die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
+    }
+    # bookkeeping
+    ++$self->{source}->{nlink};
+    if (--$self->{target}->{nlink} == 0) {
+        $self->{freed} = $self->{target}->{size};
+    }
+}
+
+sub bytes_freed {
+    my $self = shift;
+    return $self->{freed};
+}
+
+sub as_string {
+    my $self = shift;
+    return sprintf 'ln -f %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
+
+1;
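[Reviewer note, not part of the patch: the generator only *plans* the work,
so a dry run is just printing the instructions instead of calling run().
$filehash stands for an already-populated Directory::Simplify::FileHash.
Note the Hardlink instruction's as_string() was corrected above to 'ln -f';
run() calls link(), which makes a hard link, not a symlink.]

    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => 1,
    );
    foreach my $inst ($generator->instructions) {
        print "$inst\n";   # e.g. ln -f '...' '...'  /  touch -r '...' '...'
    }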
diff --git a/Directory/Simplify/Utils.pm b/Directory/Simplify/Utils.pm
new file mode 100644
index 0000000..5c05489
--- /dev/null
+++ b/Directory/Simplify/Utils.pm
@@ -0,0 +1,47 @@
+package Directory::Simplify::Utils;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+
+sub addcommas {
+    my @added;
+    foreach my $num (@_) {
+        # don't split anything after the decimal
+        my @parts = split /\./, $num;
+        while ($parts[0] =~ s/(\d)(\d{3}(?:\D|$))/$1,$2/) {
+        }
+        push @added, (join '.', @parts);
+    }
+    wantarray ? @added : $added[0]
+}
+
+sub hr_size {
+    my $sz = shift;
+    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
+    my $fact = 1024;
+    my $thresh = 0.1;
+    my @ret;
+    foreach my $exp (reverse 0 .. $#sizes) {
+        if ($sz > (1 - $thresh) * $fact ** $exp) {
+            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
+            last;
+        }
+    }
+
+    # default to ($sz, 'bytes')
+    @ret = ($sz, $sizes[0]) unless @ret;
+
+    wantarray ? @ret : "@ret"
+}
+
+sub shell_quote {
+    # shell-escape argument for inclusion in non-interpolated single quotes
+    my @transformed = map {
+        (my $out = $_)
+            =~ s/'/'\\''/g;
+        "'$out'";
+    } @_;
+    wantarray ? @transformed : $transformed[0];
+}
+
+1;
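[Reviewer note, not part of the patch: expected behavior of the three
helpers, for quick verification.]

    print Directory::Simplify::Utils::addcommas(1234567), "\n";     # 1,234,567
    print scalar Directory::Simplify::Utils::hr_size(1048576), "\n"; # 1 MB
    print Directory::Simplify::Utils::shell_quote("it's"), "\n";    # 'it'\''s'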
diff --git a/make-allinone.sh b/make-allinone.sh
new file mode 100755
index 0000000..5f201d9
--- /dev/null
+++ b/make-allinone.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# vi: et sts=4 sw=4 ts=4
+WORKDIR=${0%/*}
+OUT=$WORKDIR/simplify_static_dir.pl
+
+echo "Outputting to $OUT" >&2
+
+shopt -s globstar
+"$WORKDIR/util/squash" \
+    "$WORKDIR/simplify_static_dir-main.pl" \
+    "$WORKDIR"/**/*.pm \
+    > "$OUT"
+chmod +x -- "$OUT"
diff --git a/simplify_static_dir-main.pl b/simplify_static_dir-main.pl
new file mode 100755
index 0000000..e69e39a
--- /dev/null
+++ b/simplify_static_dir-main.pl
@@ -0,0 +1,257 @@
+#!/usr/bin/perl
+# vi: et sts=4 sw=4 ts=4
+
+package main;
+use strict;
+use warnings;
+use List::Util qw/ sum /;
+
+our $VERSION = '3.1.0';
+
+=pod
+
+=head1 NAME
+
+simplify_static_dir - optimize directories for size by combining identical
+files
+
+=head1 SYNOPSIS
+
+B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...
+
+=head1 DESCRIPTION
+
+The more files this script can process at once, the better. It maintains an
+internal hash of files indexed by their SHA1 checksum. When it finds a
+collision it removes the file with the least amount of file system links, then
+creates a hard link to the other in its place. The larger the number of files
+scanned, the more likely it is that there will be collisions.
+
+There are failsafes in place, though. If somehow two files' SHA1 checksums are
+identical, but the file sizes do not match, this program will error out (and
+you can go ahead and report the collision; it may be worth money).
+
+There are other features built in as well. Following the logic that unique
+data, once created, has the attribute of being unique as of that point in
+time, this script will copy the timestamp of the older file onto the newer
+file just before it makes the hard link. Therefore, many times, simplified
+directories will have the appearance of being older than they actually are.
+
+From the perspective of any program reading files from the simplified
+directories, the files will lookB<*> and behave the same way as the initial
+state.
+
+B<*> Except for having an increased number of hard links.
+
+=head1 OPTIONS
+
+=over
+
+=item B<-v>
+
+Verbose output.
+
+=item B<-f>
+
+Print a sum of the number of freed bytes.
+
+=item B<-m> I<PATTERN>
+
+Only match file paths matching I<PATTERN>.
+
+=item B<-M> I<PATTERN>
+
+Exclude file paths matching I<PATTERN>.
+
+=item B<-z>
+
+Include zero-length files in search. Normally they are ignored (you don't save
+diskspace by hard-linking empty files).
+
+=item B<--help>
+
+Output this help message and exit.
+
+=item B<--version>
+
+Output version information and exit.
+
+=back
+
+By default, scans the current directory.
+
+=head1 COPYRIGHT
+
+Copyright (C) 2010-2022 Dan Church.
+
+License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)
+
+with Commons Clause 1.0 (L<https://commonsclause.com/>).
+
+This is free software: you are free to change and redistribute it.
+
+There is NO WARRANTY, to the extent permitted by law.
+
+You may NOT use this software for commercial purposes.
+
+=head1 AUTHOR
+
+Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
+
+=cut
+
+use File::Find qw/ find /;
+use Getopt::Std qw/ getopts /;
+use Pod::Usage qw/ pod2usage /;
+
+# :squash-ignore-start:
+# (this prepends to the load path)
+use FindBin qw//;
+use lib $FindBin::RealBin;
+
+require Directory::Simplify::File;
+require Directory::Simplify::FileHash;
+require Directory::Simplify::Instruction::Generator;
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub HELP_MESSAGE {
+    my $fh = shift;
+    &pod2usage(
+        -verbose => 1,
+        -exitval => 0,
+    );
+}
+
+MAIN: {
+    &getopts('vfm:M:z', \ my %opts)
+        || &pod2usage(
+            -exitval => 2,
+            -msg => "Try '$0 --help' for more information",
+        );
+
+    my $verbose = defined $opts{v};
+    my $print_freed = defined $opts{f};
+    my $files_match = $opts{m} || '';
+    my $files_exclude = $opts{M} || '';
+    my $include_zero_length_files = defined $opts{z};
+    my $print_progress = $verbose;
+
+    # correct relative paths
+    # OR if no directories given, search the current directory
+    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
+
+    my @files;
+    print STDERR 'Finding files...'
+        if $verbose;
+
+    &find(sub {
+        # outright skip directories (don't report skip)
+        return if -d $File::Find::name;
+
+        # skip non-existent files and links
+        unless (-f $File::Find::name && ! -l $File::Find::name) {
+            return;
+        }
+
+        push @files, Directory::Simplify::File->new($File::Find::name);
+    }, @dirs_to_process);
+
+    printf STDERR "%d files found",
+        scalar @files
+        if $verbose;
+
+    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
+    #
+    # Truth table:
+    # -m matches | -M is used & matches | !return?
+    #     0      |          0           |    0
+    #     0      |          1           |    0
+    #     1      |          0           |    1
+    #     1      |          1           |    0
+    # note: m// will match everything
+    my $file_ct_before_filter = scalar @files;
+    @files = grep {
+        $_->{rel_name} =~ $files_match
+    } @files;
+    if ($file_ct_before_filter != scalar @files) {
+        printf STDERR " (%d files filtered by -m rule)",
+            $file_ct_before_filter - scalar @files
+            if $verbose;
+    }
+    if (length $files_exclude) {
+        $file_ct_before_filter = scalar @files;
+        @files = grep {
+            not $_->{rel_name} =~ $files_exclude
+        } @files;
+        if ($file_ct_before_filter != scalar @files) {
+            printf STDERR " (%d files filtered by -M rule)",
+                $file_ct_before_filter - scalar @files
+                if $verbose;
+        }
+    }
+
+    # Shortcut: Only generate hashes and inspect files that do not have a
+    # unique size. The reasoning being that if file sizes do not match,
+    # there's no possible way those two files can have the same contents.
+    my %file_sizes;
+    ++$file_sizes{$_->{size}} foreach @files;
+    @files = grep {
+        $file_sizes{$_->{size}} > 1
+    } @files;
+
+    printf STDERR " (%d candidates).\n",
+        scalar @files
+        if $verbose;
+
+    unless (@files) {
+        printf STDERR "Nothing to do.\n";
+        exit 0;
+    }
+
+    print STDERR "Generating hashes..."
+        if $verbose;
+    my $filehash = Directory::Simplify::FileHash->new;
+    my $report_every = 1; # seconds
+    my $processed_bytes = 0;
+    my $last_report = time;
+    my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0);
+    my $cb;
+    if ($print_progress) {
+        printf STDERR "\e\x{37}";
+        $cb = sub {
+            my ($file, $now) = (shift, time);
+            $processed_bytes += $file->{size};
+            if ($now >= $last_report + $report_every) {
+                printf STDERR "\e\x{38}%8s / %8s",
+                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
+                    $total_size_hr;
+                $last_report = $now;
+            }
+        };
+    }
+    $filehash->add({
+        files => \@files,
+        callback => $cb,
+    });
+    print STDERR "done.\n"
+        if $verbose;
+
+    my $generator = Directory::Simplify::Instruction::Generator->new(
+        filehash => $filehash,
+        min_size => ($include_zero_length_files ? 0 : 1),
+    );
+
+    my $freed_bytes = 0;
+
+    foreach my $inst ($generator->instructions) {
+        print STDERR $inst, "\n" if $verbose;
+        $inst->run;
+        $freed_bytes += $inst->bytes_freed;
+    }
+
+    printf STDERR "freed %s bytes (%0.4G %s)\n",
+        Directory::Simplify::Utils::addcommas($freed_bytes),
+        Directory::Simplify::Utils::hr_size($freed_bytes)
+        if $print_freed or $verbose;
+}
+
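[Reviewer note, not part of the patch: the whole pipeline in miniature,
mirroring MAIN above without the option parsing and progress reporting.
'/srv/www' is a hypothetical directory; FileHash loads
Directory::Simplify::File itself.]

    use strict;
    use warnings;
    use File::Find qw/ find /;
    require Directory::Simplify::FileHash;
    require Directory::Simplify::Instruction::Generator;

    my @files;
    find(sub { push @files, $File::Find::name if -f && ! -l }, '/srv/www');
    my $filehash = Directory::Simplify::FileHash->new;
    $filehash->add(@files);   # bare paths are wrapped into File objects
    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => 1,
    );
    $_->run foreach $generator->instructions;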
if $verbose; + my $filehash = Directory::Simplify::FileHash->new; + my $report_every = 1; # seconds + my $processed_bytes = 0; + my $last_report = time; + my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0); + my $cb; + if ($print_progress) { + printf STDERR "\e\x{37}"; + $cb = sub { + my ($file, $now) = (shift, time); + $processed_bytes += $file->{size}; + if ($now >= $last_report + $report_every) { + printf STDERR "\e\x{38}%8s / %8s", + (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)), + $total_size_hr; + $last_report = $now; + } + }; + } + $filehash->add({ + files => \@files, + callback => $cb, + }); + print STDERR "done.\n" + if $verbose; + + my $generator = Directory::Simplify::Instruction::Generator->new( + filehash => $filehash, + min_size => ($include_zero_length_files ? 0 : 1), + ); + + my $freed_bytes = 0; + + foreach my $inst ($generator->instructions) { + print STDERR $inst, "\n" if $verbose; + $inst->run; + $freed_bytes += $inst->bytes_freed; + } + + printf STDERR "freed %s bytes (%0.4G %s)\n", + Directory::Simplify::Utils::addcommas($freed_bytes), + Directory::Simplify::Utils::hr_size($freed_bytes) + if $print_freed or $verbose; +} + diff --git a/simplify_static_dir.pl b/simplify_static_dir.pl deleted file mode 100755 index 04e88e0..0000000 --- a/simplify_static_dir.pl +++ /dev/null @@ -1,611 +0,0 @@ -#!/usr/bin/perl -# vi: et sts=4 sw=4 ts=4 - -package main; -use strict; -use warnings; -use List::Util qw/ sum /; - -our $VERSION = '3.1.0'; - -=pod - -=head1 NAME - -simplify_static_dir - optimize directories for size by combining identical -files - -=head1 SYNOPSIS - -B [I] [I]... - -=head1 DESCRIPTION - -The more files this script can process at once, the better. It maintains an -internal hash of files indexed by their SHA1 checksum. When it finds a -collision it removes the file with the least amount of file system links, then -creates a hard link to the other in its place. The larger the number of files -scanned, the more likely it is that there will be collisions. - -There are failsafes in place, though. If somehow two files' SHA1 checksums are -identical, but the file sizes do not match, this program will error out (and -you can go ahead and report the collision; it may be worth money). - -There are other features built in as well. Following the logic that unique -data, once created has the attribute of being unique of that point in time, -this script will copy the timestamp of the older file onto the newer file just -before it makes the hard link. Therefore, many times, simplified directories -will have the appearance of being older than they actually are. - -From the perspective of any program reading files from the simplified -directories, the files will lookB<*> and behave the same way as the initial -state. - -B<*> Except for having an increased number of hard links. - -=head1 OPTIONS - -=over - -=item B<-v> - -Verbose output. - -=item B<-f> - -Print a sum of the number of freed bytes. - -=item B<-m> I - -Only match file paths matching I. - -=item B<-M> I - -Exclude file paths matching I. - -=item B<-z> - -Include zero-length files in search. Normally they are ignored (you don't save -diskspace by hard-linking empty files). - -=item B<--help> - -Output this help message and exit. - -=item B<--version> - -Output version information and exit. - -=back - -By default, scans the current directory. - -=head1 COPYRIGHT - -Copyright (C) 2010-2022 Dan Church. 
-
-License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)
-
-with Commons Clause 1.0 (L<https://commonsclause.com/>).
-
-This is free software: you are free to change and redistribute it.
-
-There is NO WARRANTY, to the extent permitted by law.
-
-You may NOT use this software for commercial purposes.
-
-=head1 AUTHOR
-
-Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
-
-=cut
-
-use File::Find qw/ find /;
-use Getopt::Std qw/ getopts /;
-use Pod::Usage qw/ pod2usage /;
-
-sub HELP_MESSAGE {
-    my $fh = shift;
-    &pod2usage(
-        -verbose => 1,
-        -exitval => 0,
-    );
-}
-
-MAIN: {
-    &getopts('vfm:M:z', \ my %opts)
-        || &pod2usage(
-            -exitval => 2,
-            -msg => "Try '$0 --help' for more information",
-        );
-
-    my $verbose = defined $opts{v};
-    my $print_freed = defined $opts{f};
-    my $files_match = $opts{m} || '';
-    my $files_exclude = $opts{M} || '';
-    my $include_zero_length_files = defined $opts{z};
-    my $print_progress = $verbose;
-
-    # correct relative paths
-    # OR if no directories given, search the current directory
-    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
-
-    my @files;
-    print STDERR 'Finding files...'
-        if $verbose;
-
-    &find(sub {
-        # outright skip directories (don't report skip)
-        return if -d $File::Find::name;
-
-        # skip non-existent files and links
-        unless (-f $File::Find::name && ! -l $File::Find::name) {
-            return;
-        }
-
-        push @files, Directory::Simplify::File->new($File::Find::name);
-    }, @dirs_to_process);
-
-    printf STDERR "%d files found",
-        scalar @files
-        if $verbose;
-
-    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
-    #
-    # Truth table:
-    # -m matches | -M is used & matches | !return?
-    #     0      |          0           |    0
-    #     0      |          1           |    0
-    #     1      |          0           |    1
-    #     1      |          1           |    0
-    # note: m// will match everything
-    my $file_ct_before_filter = scalar @files;
-    @files = grep {
-        $_->{rel_name} =~ $files_match
-    } @files;
-    if ($file_ct_before_filter != scalar @files) {
-        printf STDERR " (%d files filtered by -m rule)",
-            $file_ct_before_filter - scalar @files
-            if $verbose;
-    }
-    if (length $files_exclude) {
-        $file_ct_before_filter = scalar @files;
-        @files = grep {
-            not $_->{rel_name} =~ $files_exclude
-        } @files;
-        if ($file_ct_before_filter != scalar @files) {
-            printf STDERR " (%d files filtered by -M rule)",
-                $file_ct_before_filter - scalar @files
-                if $verbose;
-        }
-    }
-
-    # Shortcut: Only generate hashes and inspect files that do not have a
-    # unique size. The reasoning being that file sizes do not match, there's no
-    # possible way those two files can have the same contents.
-    my %file_sizes;
-    ++$file_sizes{$_->{size}} foreach @files;
-    @files = grep {
-        $file_sizes{$_->{size}} > 1
-    } @files;
-
-    printf STDERR " (%d candidates).\n",
-        scalar @files
-        if $verbose;
-
-    unless (@files) {
-        printf STDERR "Nothing to do.\n";
-        exit 0;
-    }
-
-    print STDERR "Generating hashes..."
-        if $verbose;
-    my $filehash = Directory::Simplify::FileHash->new;
-    my $report_every = 1; # seconds
-    my $processed_bytes = 0;
-    my $last_report = time;
-    my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0);
-    my $cb;
-    if ($print_progress) {
-        printf STDERR "\e\x{37}";
-        $cb = sub {
-            my ($file, $now) = (shift, time);
-            $processed_bytes += $file->{size};
-            if ($now >= $last_report + $report_every) {
-                printf STDERR "\e\x{38}%8s / %8s",
-                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
-                    $total_size_hr;
-                $last_report = $now;
-            }
-        };
-    }
-    $filehash->add({
-        files => \@files,
-        callback => $cb,
-    });
-    print STDERR "done.\n"
-        if $verbose;
-
-    my $generator = Directory::Simplify::Instruction::Generator->new(
-        filehash => $filehash,
-        min_size => ($include_zero_length_files ? 0 : 1),
-    );
-
-    my $freed_bytes = 0;
-
-    foreach my $inst ($generator->instructions) {
-        print STDERR $inst, "\n" if $verbose;
-        $inst->run;
-        $freed_bytes += $inst->bytes_freed;
-    }
-
-    printf STDERR "freed %s bytes (%0.4G %s)\n",
-        Directory::Simplify::Utils::addcommas($freed_bytes),
-        Directory::Simplify::Utils::hr_size($freed_bytes)
-        if $print_freed or $verbose;
-}
-
-package Directory::Simplify::Utils;
-use strict;
-use warnings;
-
-sub addcommas {
-    my @added;
-    foreach my $num (@_) {
-        # don't split anything after the decimal
-        my @parts = split /\./, $num;
-        while ($parts[0] =~ s/(\d)(\d{3}(?:\D|$))/$1,$2/) {
-        }
-        push @added, (join '.', @parts);
-    }
-    wantarray ? @added : $added[0]
-}
-
-sub hr_size {
-    my $sz = shift;
-    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
-    my $fact = 1024;
-    my $thresh = 0.1;
-    my @ret;
-    foreach my $exp (reverse 0 .. $#sizes) {
-        if ($sz > (1 - $thresh) * $fact ** $exp) {
-            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
-            last;
-        }
-    }
-
-    # default to ($sz, 'bytes')
-    @ret = ($sz, $sizes[0]) unless @ret;
-
-    wantarray ? @ret : "@ret"
-}
-
-sub shell_quote {
-    # shell-escape argument for inclusion in non-interpolated single quotes
-    my @transformed = map {
-        (my $out = $_)
-            =~ s/'/'\\''/g;
-        "'$out'";
-    } @_;
-    wantarray ? @transformed : $transformed[0];
-}
-
-package Directory::Simplify::Instruction::Hardlink;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-
-sub new {
-    my $class = shift;
-    return bless {
-        freed => 0,
-        @_,
-    }, $class;
-}
-
-sub run {
-    my $self = shift;
-    # hard link the files
-
-    unless (unlink $self->{target}->{name}) {
-        die "Failed to remove file `$self->{target}->{name}': $!\n";
-    }
-    unless (link $self->{source}->{name}, $self->{target}->{name}) {
-        die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
-    }
-    # bookkeeping
-    ++$self->{source}->{nlink};
-    if (--$self->{target}->{nlink} == 0) {
-        $self->{freed} = $self->{target}->{size};
-    }
-}
-
-sub bytes_freed {
-    my $self = shift;
-    return $self->{freed};
-}
-
-sub as_string {
-    my $self = shift;
-    return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
-}
-
-package Directory::Simplify::Instruction::CopyTimestamp;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-
-sub new {
-    my $class = shift;
-    return bless {
-        @_,
-    }, $class;
-}
-
-sub run {
-    my $self = shift;
-    # preserve older time stamp
-    utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
-}
-
-sub bytes_freed {
-    return 0;
-}
-
-sub as_string {
-    my $self = shift;
-    return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
-}
-
-package Directory::Simplify::Instruction::Generator;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-use File::Basename qw/ dirname /;
-use File::Compare qw/ compare /;
-
-sub new {
-    my $class = shift;
-    return bless {
-        filehash => undef,
-        min_size => 1,
-        @_,
-    }, $class;
-}
-
-sub as_string {
-    my $self = shift;
-    join "\n", $self->instructions;
-}
-
-sub buckets {
-    my $self = shift;
-
-    my @candidate_lists = $self->{filehash}->entries;
-
-    my @buckets;
-    foreach my $candidates (@candidate_lists) {
-        my @ca = @{$candidates}; # make a clone
-        my @these_buckets;
-
-        # at least two files needed to link together
-        if (@ca > 1) {
-            ELIMINATOR: while (@ca) {
-                my $entry = shift @ca;
-
-                next ELIMINATOR if $self->_entry_should_be_skipped($entry);
-
-                foreach my $bucket_idx (0 .. $#these_buckets) {
-                    if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
-                        push @{$these_buckets[$bucket_idx]}, $entry;
-                        next ELIMINATOR;
-                    }
-                }
-                # didn't find a bucket (even though the hash matched!)
-                # -> new bucket
-                push @these_buckets, [$entry];
-            }
-        }
-        push @buckets, @these_buckets;
-    }
-
-    @buckets
-}
-
-sub _entry_should_be_skipped {
-    my ($self, $entry_a) = @_;
-    # too small to be hard-linked
-    if ($entry_a->{size} < $self->{min_size}) {
-        return 1;
-    }
-    return 0;
-}
-
-sub _entries_are_hard_linkable {
-    my ($entry_a, $entry_b) = @_;
-
-    # obviously, if the sizes aren't the same, they're not the same file
-    unless (&_entries_sizes_match($entry_a, $entry_b)) {
-        return 0;
-    }
-
-    # they're the same file, don't try it
-    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
-        return 0;
-    }
-
-    if (&_entries_contents_match($entry_a, $entry_b)) {
-        return 1;
-    }
-    return 0;
-}
-
-sub oldest_mtime {
-    my $self = shift;
-    return sort {
-        $a->{mtime} <=> $b->{mtime}
-    } @_;
-}
-
-sub more_linked {
-    my $self = shift;
-    my %warned;
-    return sort {
-        if (! -w &dirname($a->{name})) {
-            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
-            return 1; # favor a -> front
-        } elsif (! -w &dirname($b->{name})) {
-            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
-            return -1; # favor b -> front
-        }
-        $b->{nlink} <=> $a->{nlink}
-    } @_;
-}
-
-sub _entries_are_already_hard_linked {
-    my ($entry_a, $entry_b) = @_;
-
-    if ($entry_a->{ino} == $entry_b->{ino} and
-        $entry_a->{dev} == $entry_b->{dev}) {
-        return 1;
-    }
-
-    return 0;
-}
-
-sub _entries_sizes_match {
-    my ($entry_a, $entry_b) = @_;
-    if ($entry_a->{size} != $entry_b->{size}) {
-        return 0;
-    }
-    return 1;
-}
-sub _entries_contents_match {
-    my ($entry_a, $entry_b) = @_;
-
-    my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
-
-    # warn about hash collision
-    unless ($contents_same) {
-        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
-    }
-    return $contents_same;
-}
-
-# generate hardlink instructions
-sub instructions {
-    require Scalar::Util;
-    my $self = shift;
-
-    # start generating instructions
-    my @inst;
-    foreach my $bucket ($self->buckets) {
-
-        # of the bucket, find the oldest timestamp
-        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
-
-        # of the bucket, find the file most embedded in the file system
-        my @to_link = $self->more_linked(@{$bucket});
-        my $most_linked_entry = shift @to_link;
-        foreach my $entry (@to_link) {
-            # XXX there shouldn't be a need to update entries' link counts,
-            # since this generates all the instructions at once
-            push @inst, Directory::Simplify::Instruction::Hardlink->new(
-                source => $most_linked_entry,
-                target => $entry,
-            );
-            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
-                source => $oldest_entry,
-                target => $entry,
-            );
-        }
-        if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
-            # most_linked_entry should get its timestamp updated
-            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
-                source => $oldest_entry,
-                target => $most_linked_entry,
-            );
-        }
-    }
-    @inst
-}
-
-package Directory::Simplify::File;
-use strict;
-use warnings;
-require Cwd;
-
-sub new {
-    my $class = shift;
-    my $rel_name = shift;
-    my $self = bless {
-        rel_name => $rel_name,
-        name => Cwd::abs_path($rel_name),
-    }, $class;
-    (@{$self}{qw/ dev ino mode nlink uid gid rdev size
-                  atime mtime ctime blksize blocks /})
-        = lstat $self->{name};
-    $self
-}
-
-sub hash {
-    my $self = shift;
-    unless (defined $self->{_hash}) {
-        require Digest::SHA;
-        my $ctx = Digest::SHA->new;
-        $ctx->addfile($self->{name});
-        $self->{_hash} = $ctx->hexdigest;
-    }
-    $self->{_hash}
-}
-
-package Directory::Simplify::FileHash;
-use strict;
-use warnings;
-
-=head1 DESCRIPTION
-
-Object for abstracting management of a hashed filesystem
-
-=cut
-
-sub new {
-    my $class = shift;
-    return bless {
-        _entries => {},
-        _files_in_hash => {},
-        @_,
-    }, $class;
-}
-
-sub add {
-    my $self = shift;
-    my (@files, $callback);
-    if (ref $_[0] eq 'HASH') {
-        # Called method like { files => [] }
-        my %opts = %{$_[0]};
-        @files = @{$opts{files}};
-        $callback = $opts{callback};
-    } else {
-        @files = @_;
-    }
-    foreach my $file (@files) {
-        unless (ref $file eq 'Directory::Simplify::File') {
-            $file = Directory::Simplify::File->new($file);
-        }
-        unless ($self->{_files_in_hash}->{$file->{name}}) {
-            my $hash = $file->hash;
-
-            unless (defined $self->{_entries}->{$hash}) {
-                $self->{_entries}->{$hash} = [];
-            }
-            push @{$self->{_entries}->{$hash}}, $file;
-            &{$callback}($file) if ref $callback eq 'CODE';
-        }
-        $self->{_files_in_hash}->{$file->{name}} = 1;
-    }
-}
-
-sub entries {
-    my $self = shift;
-    values %{$self->{_entries}}
-}
diff --git a/util/squash b/util/squash
new file mode 100755
index 0000000..3403c19
--- /dev/null
+++ b/util/squash
@@ -0,0 +1,60 @@
+#!/usr/bin/perl
+# vi: et sts=4 sw=4 ts=4
+
+# Squashes together the parts of the app into the single script.
+# (Adapted from the script that squashes App::Ack - see https://github.com/beyondgrep/ack3)
+use warnings;
+use strict;
+
+my $code;
+for my $arg (@ARGV) {
+    my $filename = $arg;
+    if ($arg =~ /::/) {
+        my $key = "$arg.pm";
+        $key =~ s{::}{/}g;
+        $filename = $INC{$key} or die "Can't find the file for $arg";
+    }
+
+    warn "Reading $filename\n";
+    open my $fh, '<', $filename or die "Can't open $filename: $!";
+    my $in_pod = 0;
+    my $in_ignore_section = 0;
+    while (<$fh>) {
+        if (/#.*:squash-ignore-start:$/) {
+            $in_ignore_section = 1;
+            next;
+        } elsif (/#.*:squash-ignore-end:$/) {
+            $in_ignore_section = 0;
+            next;
+        }
+        next if $in_ignore_section;
+        next if /#.*:squash-ignore-line:$/;
+
+        next if /^\s*1;$/;
+
+        if ($filename =~ /\.pm$/) {
+            # See if we're in module POD blocks
+            if (/^=(\w+)/) {
+                $in_pod = ($1 ne 'cut');
+                next;
+            }
+            elsif ($in_pod) {
+                next;
+            }
+            next if /^# vi:/;
+        }
+
+        # Remove Perl::Critic comments.
+        # I'd like to remove all comments, but this is a start
+        s{\s*##.+critic.*}{};
+        $code .= $_;
+    }
+    close $fh;
+}
+
+# Remove repeated newlines between paragraphs
+$code =~ s/\n\n+/\n\n/gs;
+
+print $code;
+
+exit 0;
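[Reviewer note, not part of the patch: what the squash markers used by the
modules above mean to util/squash. Everything between the start/end markers
is dropped from the combined script (the inter-module `require's are only
needed when running from the repository layout), and a single line can opt
out on its own. The `$dev_only` line is a hypothetical example.]

    # :squash-ignore-start:
    require Directory::Simplify::Utils;   # kept only in the repo layout
    # :squash-ignore-end:
    my $dev_only = 1;   # :squash-ignore-line: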