From b1b4e18e053b65c4f049e9107ce1accc33219182 Mon Sep 17 00:00:00 2001
From: Dan Church <amphetamachine@gmail.com>
Date: Wed, 16 Nov 2022 11:33:08 -0600
Subject: [PATCH] Break out packages into separate files

---
 .gitignore                                  |   1 +
 Directory/Simplify/File.pm                  |  31 +
 Directory/Simplify/FileHash.pm              |  57 ++
 .../Simplify/Instruction/CopyTimestamp.pm   |  33 +
 Directory/Simplify/Instruction/Generator.pm | 180 ++++++
 Directory/Simplify/Instruction/Hardlink.pm  |  46 ++
 Directory/Simplify/Utils.pm                 |  47 ++
 make-allinone.sh                            |  13 +
 simplify_static_dir-main.pl                 | 257 ++++++++
 simplify_static_dir.pl                      | 611 ------------------
 util/squash                                 |  60 ++
 11 files changed, 725 insertions(+), 611 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 Directory/Simplify/File.pm
 create mode 100644 Directory/Simplify/FileHash.pm
 create mode 100644 Directory/Simplify/Instruction/CopyTimestamp.pm
 create mode 100644 Directory/Simplify/Instruction/Generator.pm
 create mode 100644 Directory/Simplify/Instruction/Hardlink.pm
 create mode 100644 Directory/Simplify/Utils.pm
 create mode 100755 make-allinone.sh
 create mode 100755 simplify_static_dir-main.pl
 delete mode 100755 simplify_static_dir.pl
 create mode 100755 util/squash

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b5d2e09
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/simplify_static_dir.pl
diff --git a/Directory/Simplify/File.pm b/Directory/Simplify/File.pm
new file mode 100644
index 0000000..6591751
--- /dev/null
+++ b/Directory/Simplify/File.pm
@@ -0,0 +1,31 @@
+package Directory::Simplify::File;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+require Cwd;
+
+sub new {
+    my $class = shift;
+    my $rel_name = shift;
+    my $self = bless {
+        rel_name => $rel_name,
+        name => Cwd::abs_path($rel_name),
+    }, $class;
+    (@{$self}{qw/ dev ino mode nlink uid gid rdev size
+                  atime mtime ctime blksize blocks /})
+        = lstat $self->{name};
+    $self
+}
+
+sub hash {
+    my $self = shift;
+    unless (defined $self->{_hash}) {
+        require Digest::SHA;
+        my $ctx = Digest::SHA->new;
+        $ctx->addfile($self->{name});
+        $self->{_hash} = $ctx->hexdigest;
+    }
+    $self->{_hash}
+}
+
+1;
diff --git a/Directory/Simplify/FileHash.pm b/Directory/Simplify/FileHash.pm
new file mode 100644
index 0000000..1e8db4f
--- /dev/null
+++ b/Directory/Simplify/FileHash.pm
@@ -0,0 +1,57 @@
+package Directory::Simplify::FileHash;
+use strict;
+use warnings;
+
+=head1 DESCRIPTION
+
+Object for abstracting management of a hashed filesystem
+
+=cut
+
+# :squash-ignore-start:
+require Directory::Simplify::File;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        _entries => {},
+        _files_in_hash => {},
+        @_,
+    }, $class;
+}
+
+sub add {
+    my $self = shift;
+    my (@files, $callback);
+    if (ref $_[0] eq 'HASH') {
+        # Called method like { files => [] }
+        my %opts = %{$_[0]};
+        @files = @{$opts{files}};
+        $callback = $opts{callback};
+    } else {
+        @files = @_;
+    }
+    foreach my $file (@files) {
+        unless (ref $file eq 'Directory::Simplify::File') {
+            $file = Directory::Simplify::File->new($file);
+        }
+        unless ($self->{_files_in_hash}->{$file->{name}}) {
+            my $hash = $file->hash;
+
+            unless (defined $self->{_entries}->{$hash}) {
+                $self->{_entries}->{$hash} = [];
+            }
+            push @{$self->{_entries}->{$hash}}, $file;
+            &{$callback}($file) if ref $callback eq 'CODE';
+        }
+        $self->{_files_in_hash}->{$file->{name}} = 1;
+    }
+}
+
+sub entries {
+    my $self = shift;
+    values %{$self->{_entries}}
+}
+
+1;
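[Reviewer note, not part of the patch: a minimal sketch of how the two new
modules above fit together. Bare paths passed to add() are wrapped into
Directory::Simplify::File objects, and entries() yields one bucket per
content hash. The file names 'a.txt' and 'b.txt' are hypothetical.]

    use strict;
    use warnings;
    require Directory::Simplify::File;
    require Directory::Simplify::FileHash;

    my $filehash = Directory::Simplify::FileHash->new;
    $filehash->add(
        Directory::Simplify::File->new('a.txt'),
        'b.txt',    # bare paths are wrapped into File objects automatically
    );
    # each bucket is an arrayref of files sharing one SHA digest
    foreach my $bucket ($filehash->entries) {
        printf "%s: %d file(s)\n", $bucket->[0]->hash, scalar @{$bucket};
    }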
diff --git a/Directory/Simplify/Instruction/CopyTimestamp.pm b/Directory/Simplify/Instruction/CopyTimestamp.pm
new file mode 100644
index 0000000..491a4f4
--- /dev/null
+++ b/Directory/Simplify/Instruction/CopyTimestamp.pm
@@ -0,0 +1,33 @@
+package Directory::Simplify::Instruction::CopyTimestamp;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+
+# :squash-ignore-start:
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        @_,
+    }, $class;
+}
+
+sub run {
+    my $self = shift;
+    # preserve older time stamp
+    utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
+}
+
+sub bytes_freed {
+    return 0;
+}
+
+sub as_string {
+    my $self = shift;
+    return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
+
+1;
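[Reviewer note, not part of the patch: every instruction object shares the
same small interface -- run(), bytes_freed(), and as_string() via the '""'
overload -- so the main loop can execute and log them uniformly. $older and
$newer below stand for Directory::Simplify::File objects and are
hypothetical.]

    my $inst = Directory::Simplify::Instruction::CopyTimestamp->new(
        source => $older,
        target => $newer,
    );
    print STDERR "$inst\n";   # stringifies like: touch -r '<older>' '<newer>'
    $inst->run;               # applies the older atime/mtime to the target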
diff --git a/Directory/Simplify/Instruction/Generator.pm b/Directory/Simplify/Instruction/Generator.pm
new file mode 100644
index 0000000..3567d05
--- /dev/null
+++ b/Directory/Simplify/Instruction/Generator.pm
@@ -0,0 +1,180 @@
+package Directory::Simplify::Instruction::Generator;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+use File::Basename qw/ dirname /;
+use File::Compare qw/ compare /;
+
+# :squash-ignore-start:
+require Directory::Simplify::Instruction::CopyTimestamp;
+require Directory::Simplify::Instruction::Hardlink;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        filehash => undef,
+        min_size => 1,
+        @_,
+    }, $class;
+}
+
+sub as_string {
+    my $self = shift;
+    join "\n", $self->instructions;
+}
+
+sub buckets {
+    my $self = shift;
+
+    my @candidate_lists = $self->{filehash}->entries;
+
+    my @buckets;
+    foreach my $candidates (@candidate_lists) {
+        my @ca = @{$candidates}; # make a clone
+        my @these_buckets;
+
+        # at least two files needed to link together
+        if (@ca > 1) {
+            ELIMINATOR: while (@ca) {
+                my $entry = shift @ca;
+
+                next ELIMINATOR if $self->_entry_should_be_skipped($entry);
+
+                foreach my $bucket_idx (0 .. $#these_buckets) {
+                    if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
+                        push @{$these_buckets[$bucket_idx]}, $entry;
+                        next ELIMINATOR;
+                    }
+                }
+                # didn't find a bucket (even though the hash matched!)
+                # -> new bucket
+                push @these_buckets, [$entry];
+            }
+        }
+        push @buckets, @these_buckets;
+    }
+
+    @buckets
+}
+
+sub _entry_should_be_skipped {
+    my ($self, $entry_a) = @_;
+    # too small to be hard-linked
+    if ($entry_a->{size} < $self->{min_size}) {
+        return 1;
+    }
+    return 0;
+}
+
+sub _entries_are_hard_linkable {
+    my ($entry_a, $entry_b) = @_;
+
+    # obviously, if the sizes aren't the same, they're not the same file
+    unless (&_entries_sizes_match($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    # they're the same file, don't try it
+    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
+        return 0;
+    }
+
+    if (&_entries_contents_match($entry_a, $entry_b)) {
+        return 1;
+    }
+    return 0;
+}
+
+sub oldest_mtime {
+    my $self = shift;
+    return sort {
+        $a->{mtime} <=> $b->{mtime}
+    } @_;
+}
+
+sub more_linked {
+    my $self = shift;
+    my %warned;
+    return sort {
+        if (! -w &dirname($a->{name})) {
+            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
+            return 1; # favor a -> front
+        } elsif (! -w &dirname($b->{name})) {
+            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
+            return -1; # favor b -> front
+        }
+        $b->{nlink} <=> $a->{nlink}
+    } @_;
+}
+
+sub _entries_are_already_hard_linked {
+    my ($entry_a, $entry_b) = @_;
+
+    if ($entry_a->{ino} == $entry_b->{ino} and
+        $entry_a->{dev} == $entry_b->{dev}) {
+        return 1;
+    }
+
+    return 0;
+}
+
+sub _entries_sizes_match {
+    my ($entry_a, $entry_b) = @_;
+    if ($entry_a->{size} != $entry_b->{size}) {
+        return 0;
+    }
+    return 1;
+}
+sub _entries_contents_match {
+    my ($entry_a, $entry_b) = @_;
+
+    my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
+
+    # warn about hash collision
+    unless ($contents_same) {
+        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
+    }
+    return $contents_same;
+}
+
+# generate hardlink instructions
+sub instructions {
+    require Scalar::Util;
+    my $self = shift;
+
+    # start generating instructions
+    my @inst;
+    foreach my $bucket ($self->buckets) {
+
+        # of the bucket, find the oldest timestamp
+        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
+
+        # of the bucket, find the file most embedded in the file system
+        my @to_link = $self->more_linked(@{$bucket});
+        my $most_linked_entry = shift @to_link;
+        foreach my $entry (@to_link) {
+            # XXX there shouldn't be a need to update entries' link counts,
+            # since this generates all the instructions at once
+            push @inst, Directory::Simplify::Instruction::Hardlink->new(
+                source => $most_linked_entry,
+                target => $entry,
+            );
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $entry,
+            );
+        }
+        if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
+            # most_linked_entry should get its timestamp updated
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $most_linked_entry,
+            );
+        }
+    }
+    @inst
+}
+
+1;
diff --git a/Directory/Simplify/Instruction/Hardlink.pm b/Directory/Simplify/Instruction/Hardlink.pm
new file mode 100644
index 0000000..9a7e7c0
--- /dev/null
+++ b/Directory/Simplify/Instruction/Hardlink.pm
@@ -0,0 +1,46 @@
+package Directory::Simplify::Instruction::Hardlink;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+
+# :squash-ignore-start:
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub new {
+    my $class = shift;
+    return bless {
+        freed => 0,
+        @_,
+    }, $class;
+}
+
+sub run {
+    my $self = shift;
+    # hard link the files
+
+    unless (unlink $self->{target}->{name}) {
+        die "Failed to remove file `$self->{target}->{name}': $!\n";
+    }
+    unless (link $self->{source}->{name}, $self->{target}->{name}) {
+        die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
+    }
+    # bookkeeping
+    ++$self->{source}->{nlink};
+    if (--$self->{target}->{nlink} == 0) {
+        $self->{freed} = $self->{target}->{size};
+    }
+}
+
+sub bytes_freed {
+    my $self = shift;
+    return $self->{freed};
+}
+
+sub as_string {
+    my $self = shift;
+    return sprintf 'ln -f %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
+
+1;
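[Reviewer note, not part of the patch: the generator only *plans* the work,
so a dry run is just printing the instructions instead of calling run().
$filehash stands for an already-populated Directory::Simplify::FileHash.
Note the Hardlink instruction's as_string() was corrected above to 'ln -f';
run() calls link(), which makes a hard link, not a symlink.]

    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => 1,
    );
    foreach my $inst ($generator->instructions) {
        print "$inst\n";   # e.g. ln -f '...' '...'  /  touch -r '...' '...'
    }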
diff --git a/Directory/Simplify/Utils.pm b/Directory/Simplify/Utils.pm
new file mode 100644
index 0000000..5c05489
--- /dev/null
+++ b/Directory/Simplify/Utils.pm
@@ -0,0 +1,47 @@
+package Directory::Simplify::Utils;
+# vi: et sts=4 sw=4 ts=4
+use strict;
+use warnings;
+
+sub addcommas {
+    my @added;
+    foreach my $num (@_) {
+        # don't split anything after the decimal
+        my @parts = split /\./, $num;
+        while ($parts[0] =~ s/(\d)(\d{3}(?:\D|$))/$1,$2/) {
+        }
+        push @added, (join '.', @parts);
+    }
+    wantarray ? @added : $added[0]
+}
+
+sub hr_size {
+    my $sz = shift;
+    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
+    my $fact = 1024;
+    my $thresh = 0.1;
+    my @ret;
+    foreach my $exp (reverse 0 .. $#sizes) {
+        if ($sz > (1 - $thresh) * $fact ** $exp) {
+            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
+            last;
+        }
+    }
+
+    # default to ($sz, 'bytes')
+    @ret = ($sz, $sizes[0]) unless @ret;
+
+    wantarray ? @ret : "@ret"
+}
+
+sub shell_quote {
+    # shell-escape argument for inclusion in non-interpolated single quotes
+    my @transformed = map {
+        (my $out = $_)
+            =~ s/'/'\\''/g;
+        "'$out'";
+    } @_;
+    wantarray ? @transformed : $transformed[0];
+}
+
+1;
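[Reviewer note, not part of the patch: expected behavior of the three
helpers, for quick verification.]

    print Directory::Simplify::Utils::addcommas(1234567), "\n";     # 1,234,567
    print scalar Directory::Simplify::Utils::hr_size(1048576), "\n"; # 1 MB
    print Directory::Simplify::Utils::shell_quote("it's"), "\n";    # 'it'\''s'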
diff --git a/make-allinone.sh b/make-allinone.sh
new file mode 100755
index 0000000..5f201d9
--- /dev/null
+++ b/make-allinone.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# vi: et sts=4 sw=4 ts=4
+WORKDIR=${0%/*}
+OUT=$WORKDIR/simplify_static_dir.pl
+
+echo "Outputting to $OUT" >&2
+
+shopt -s globstar
+"$WORKDIR/util/squash" \
+    "$WORKDIR/simplify_static_dir-main.pl" \
+    "$WORKDIR"/**/*.pm \
+    > "$OUT"
+chmod +x -- "$OUT"
diff --git a/simplify_static_dir-main.pl b/simplify_static_dir-main.pl
new file mode 100755
index 0000000..e69e39a
--- /dev/null
+++ b/simplify_static_dir-main.pl
@@ -0,0 +1,257 @@
+#!/usr/bin/perl
+# vi: et sts=4 sw=4 ts=4
+
+package main;
+use strict;
+use warnings;
+use List::Util qw/ sum /;
+
+our $VERSION = '3.1.0';
+
+=pod
+
+=head1 NAME
+
+simplify_static_dir - optimize directories for size by combining identical
+files
+
+=head1 SYNOPSIS
+
+B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...
+
+=head1 DESCRIPTION
+
+The more files this script can process at once, the better. It maintains an
+internal hash of files indexed by their SHA1 checksum. When it finds a
+collision it removes the file with the least amount of file system links, then
+creates a hard link to the other in its place. The larger the number of files
+scanned, the more likely it is that there will be collisions.
+
+There are failsafes in place, though. If somehow two files' SHA1 checksums are
+identical, but the file sizes do not match, this program will error out (and
+you can go ahead and report the collision; it may be worth money).
+
+There are other features built in as well. Following the logic that unique
+data, once created, has the attribute of being unique as of that point in
+time, this script will copy the timestamp of the older file onto the newer
+file just before it makes the hard link. Therefore, many times, simplified
+directories will have the appearance of being older than they actually are.
+
+From the perspective of any program reading files from the simplified
+directories, the files will lookB<*> and behave the same way as the initial
+state.
+
+B<*> Except for having an increased number of hard links.
+
+=head1 OPTIONS
+
+=over
+
+=item B<-v>
+
+Verbose output.
+
+=item B<-f>
+
+Print a sum of the number of freed bytes.
+
+=item B<-m> I<PATTERN>
+
+Only match file paths matching I<PATTERN>.
+
+=item B<-M> I<PATTERN>
+
+Exclude file paths matching I<PATTERN>.
+
+=item B<-z>
+
+Include zero-length files in search. Normally they are ignored (you don't save
+diskspace by hard-linking empty files).
+
+=item B<--help>
+
+Output this help message and exit.
+
+=item B<--version>
+
+Output version information and exit.
+
+=back
+
+By default, scans the current directory.
+
+=head1 COPYRIGHT
+
+Copyright (C) 2010-2022 Dan Church.
+
+License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)
+
+with Commons Clause 1.0 (L<https://commonsclause.com/>).
+
+This is free software: you are free to change and redistribute it.
+
+There is NO WARRANTY, to the extent permitted by law.
+
+You may NOT use this software for commercial purposes.
+
+=head1 AUTHOR
+
+Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
+
+=cut
+
+use File::Find qw/ find /;
+use Getopt::Std qw/ getopts /;
+use Pod::Usage qw/ pod2usage /;
+
+# :squash-ignore-start:
+# (this prepends to the load path)
+use FindBin qw//;
+use lib $FindBin::RealBin;
+
+require Directory::Simplify::File;
+require Directory::Simplify::FileHash;
+require Directory::Simplify::Instruction::Generator;
+require Directory::Simplify::Utils;
+# :squash-ignore-end:
+
+sub HELP_MESSAGE {
+    my $fh = shift;
+    &pod2usage(
+        -verbose => 1,
+        -exitval => 0,
+    );
+}
+
+MAIN: {
+    &getopts('vfm:M:z', \ my %opts)
+        || &pod2usage(
+            -exitval => 2,
+            -msg => "Try '$0 --help' for more information",
+        );
+
+    my $verbose = defined $opts{v};
+    my $print_freed = defined $opts{f};
+    my $files_match = $opts{m} || '';
+    my $files_exclude = $opts{M} || '';
+    my $include_zero_length_files = defined $opts{z};
+    my $print_progress = $verbose;
+
+    # correct relative paths
+    # OR if no directories given, search the current directory
+    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
+
+    my @files;
+    print STDERR 'Finding files...'
+        if $verbose;
+
+    &find(sub {
+        # outright skip directories (don't report skip)
+        return if -d $File::Find::name;
+
+        # skip non-existent files and links
+        unless (-f $File::Find::name && ! -l $File::Find::name) {
+            return;
+        }
+
+        push @files, Directory::Simplify::File->new($File::Find::name);
+    }, @dirs_to_process);
+
+    printf STDERR "%d files found",
+        scalar @files
+        if $verbose;
+
+    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
+    #
+    # Truth table:
+    # -m matches | -M is used & matches | !return?
+    #     0      |          0           |    0
+    #     0      |          1           |    0
+    #     1      |          0           |    1
+    #     1      |          1           |    0
+    # note: m// will match everything
+    my $file_ct_before_filter = scalar @files;
+    @files = grep {
+        $_->{rel_name} =~ $files_match
+    } @files;
+    if ($file_ct_before_filter != scalar @files) {
+        printf STDERR " (%d files filtered by -m rule)",
+            $file_ct_before_filter - scalar @files
+            if $verbose;
+    }
+    if (length $files_exclude) {
+        $file_ct_before_filter = scalar @files;
+        @files = grep {
+            not $_->{rel_name} =~ $files_exclude
+        } @files;
+        if ($file_ct_before_filter != scalar @files) {
+            printf STDERR " (%d files filtered by -M rule)",
+                $file_ct_before_filter - scalar @files
+                if $verbose;
+        }
+    }
+
+    # Shortcut: Only generate hashes and inspect files that do not have a
+    # unique size. The reasoning being that if file sizes do not match,
+    # there's no possible way those two files can have the same contents.
+    my %file_sizes;
+    ++$file_sizes{$_->{size}} foreach @files;
+    @files = grep {
+        $file_sizes{$_->{size}} > 1
+    } @files;
+
+    printf STDERR " (%d candidates).\n",
+        scalar @files
+        if $verbose;
+
+    unless (@files) {
+        printf STDERR "Nothing to do.\n";
+        exit 0;
+    }
+
+    print STDERR "Generating hashes..."
+        if $verbose;
+    my $filehash = Directory::Simplify::FileHash->new;
+    my $report_every = 1; # seconds
+    my $processed_bytes = 0;
+    my $last_report = time;
+    my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0);
+    my $cb;
+    if ($print_progress) {
+        printf STDERR "\e\x{37}";
+        $cb = sub {
+            my ($file, $now) = (shift, time);
+            $processed_bytes += $file->{size};
+            if ($now >= $last_report + $report_every) {
+                printf STDERR "\e\x{38}%8s / %8s",
+                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
+                    $total_size_hr;
+                $last_report = $now;
+            }
+        };
+    }
+    $filehash->add({
+        files => \@files,
+        callback => $cb,
+    });
+    print STDERR "done.\n"
+        if $verbose;
+
+    my $generator = Directory::Simplify::Instruction::Generator->new(
+        filehash => $filehash,
+        min_size => ($include_zero_length_files ? 0 : 1),
+    );
+
+    my $freed_bytes = 0;
+
+    foreach my $inst ($generator->instructions) {
+        print STDERR $inst, "\n" if $verbose;
+        $inst->run;
+        $freed_bytes += $inst->bytes_freed;
+    }
+
+    printf STDERR "freed %s bytes (%0.4G %s)\n",
+        Directory::Simplify::Utils::addcommas($freed_bytes),
+        Directory::Simplify::Utils::hr_size($freed_bytes)
+        if $print_freed or $verbose;
+}
+
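[Reviewer note, not part of the patch: the whole pipeline in miniature,
mirroring MAIN above without the option parsing and progress reporting.
'/srv/www' is a hypothetical directory; FileHash loads
Directory::Simplify::File itself.]

    use strict;
    use warnings;
    use File::Find qw/ find /;
    require Directory::Simplify::FileHash;
    require Directory::Simplify::Instruction::Generator;

    my @files;
    find(sub { push @files, $File::Find::name if -f && ! -l }, '/srv/www');
    my $filehash = Directory::Simplify::FileHash->new;
    $filehash->add(@files);   # bare paths are wrapped into File objects
    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => 1,
    );
    $_->run foreach $generator->instructions;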
if $verbose; + my $filehash = Directory::Simplify::FileHash->new; + my $report_every = 1; # seconds + my $processed_bytes = 0; + my $last_report = time; + my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0); + my $cb; + if ($print_progress) { + printf STDERR "\e\x{37}"; + $cb = sub { + my ($file, $now) = (shift, time); + $processed_bytes += $file->{size}; + if ($now >= $last_report + $report_every) { + printf STDERR "\e\x{38}%8s / %8s", + (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)), + $total_size_hr; + $last_report = $now; + } + }; + } + $filehash->add({ + files => \@files, + callback => $cb, + }); + print STDERR "done.\n" + if $verbose; + + my $generator = Directory::Simplify::Instruction::Generator->new( + filehash => $filehash, + min_size => ($include_zero_length_files ? 0 : 1), + ); + + my $freed_bytes = 0; + + foreach my $inst ($generator->instructions) { + print STDERR $inst, "\n" if $verbose; + $inst->run; + $freed_bytes += $inst->bytes_freed; + } + + printf STDERR "freed %s bytes (%0.4G %s)\n", + Directory::Simplify::Utils::addcommas($freed_bytes), + Directory::Simplify::Utils::hr_size($freed_bytes) + if $print_freed or $verbose; +} + diff --git a/simplify_static_dir.pl b/simplify_static_dir.pl deleted file mode 100755 index 04e88e0..0000000 --- a/simplify_static_dir.pl +++ /dev/null @@ -1,611 +0,0 @@ -#!/usr/bin/perl -# vi: et sts=4 sw=4 ts=4 - -package main; -use strict; -use warnings; -use List::Util qw/ sum /; - -our $VERSION = '3.1.0'; - -=pod - -=head1 NAME - -simplify_static_dir - optimize directories for size by combining identical -files - -=head1 SYNOPSIS - -B [I] [I]... - -=head1 DESCRIPTION - -The more files this script can process at once, the better. It maintains an -internal hash of files indexed by their SHA1 checksum. When it finds a -collision it removes the file with the least amount of file system links, then -creates a hard link to the other in its place. The larger the number of files -scanned, the more likely it is that there will be collisions. - -There are failsafes in place, though. If somehow two files' SHA1 checksums are -identical, but the file sizes do not match, this program will error out (and -you can go ahead and report the collision; it may be worth money). - -There are other features built in as well. Following the logic that unique -data, once created has the attribute of being unique of that point in time, -this script will copy the timestamp of the older file onto the newer file just -before it makes the hard link. Therefore, many times, simplified directories -will have the appearance of being older than they actually are. - -From the perspective of any program reading files from the simplified -directories, the files will lookB<*> and behave the same way as the initial -state. - -B<*> Except for having an increased number of hard links. - -=head1 OPTIONS - -=over - -=item B<-v> - -Verbose output. - -=item B<-f> - -Print a sum of the number of freed bytes. - -=item B<-m> I - -Only match file paths matching I. - -=item B<-M> I - -Exclude file paths matching I. - -=item B<-z> - -Include zero-length files in search. Normally they are ignored (you don't save -diskspace by hard-linking empty files). - -=item B<--help> - -Output this help message and exit. - -=item B<--version> - -Output version information and exit. - -=back - -By default, scans the current directory. - -=head1 COPYRIGHT - -Copyright (C) 2010-2022 Dan Church. 
-
-License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)
-
-with Commons Clause 1.0 (L<https://commonsclause.com/>).
-
-This is free software: you are free to change and redistribute it.
-
-There is NO WARRANTY, to the extent permitted by law.
-
-You may NOT use this software for commercial purposes.
-
-=head1 AUTHOR
-
-Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
-
-=cut
-
-use File::Find qw/ find /;
-use Getopt::Std qw/ getopts /;
-use Pod::Usage qw/ pod2usage /;
-
-sub HELP_MESSAGE {
-    my $fh = shift;
-    &pod2usage(
-        -verbose => 1,
-        -exitval => 0,
-    );
-}
-
-MAIN: {
-    &getopts('vfm:M:z', \ my %opts)
-        || &pod2usage(
-            -exitval => 2,
-            -msg => "Try '$0 --help' for more information",
-        );
-
-    my $verbose = defined $opts{v};
-    my $print_freed = defined $opts{f};
-    my $files_match = $opts{m} || '';
-    my $files_exclude = $opts{M} || '';
-    my $include_zero_length_files = defined $opts{z};
-    my $print_progress = $verbose;
-
-    # correct relative paths
-    # OR if no directories given, search the current directory
-    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
-
-    my @files;
-    print STDERR 'Finding files...'
-        if $verbose;
-
-    &find(sub {
-        # outright skip directories (don't report skip)
-        return if -d $File::Find::name;
-
-        # skip non-existent files and links
-        unless (-f $File::Find::name && ! -l $File::Find::name) {
-            return;
-        }
-
-        push @files, Directory::Simplify::File->new($File::Find::name);
-    }, @dirs_to_process);
-
-    printf STDERR "%d files found",
-        scalar @files
-        if $verbose;
-
-    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
-    #
-    # Truth table:
-    # -m matches | -M is used & matches | !return?
-    #     0      |          0           |    0
-    #     0      |          1           |    0
-    #     1      |          0           |    1
-    #     1      |          1           |    0
-    # note: m// will match everything
-    my $file_ct_before_filter = scalar @files;
-    @files = grep {
-        $_->{rel_name} =~ $files_match
-    } @files;
-    if ($file_ct_before_filter != scalar @files) {
-        printf STDERR " (%d files filtered by -m rule)",
-            $file_ct_before_filter - scalar @files
-            if $verbose;
-    }
-    if (length $files_exclude) {
-        $file_ct_before_filter = scalar @files;
-        @files = grep {
-            not $_->{rel_name} =~ $files_exclude
-        } @files;
-        if ($file_ct_before_filter != scalar @files) {
-            printf STDERR " (%d files filtered by -M rule)",
-                $file_ct_before_filter - scalar @files
-                if $verbose;
-        }
-    }
-
-    # Shortcut: Only generate hashes and inspect files that do not have a
-    # unique size. The reasoning being that file sizes do not match, there's no
-    # possible way those two files can have the same contents.
-    my %file_sizes;
-    ++$file_sizes{$_->{size}} foreach @files;
-    @files = grep {
-        $file_sizes{$_->{size}} > 1
-    } @files;
-
-    printf STDERR " (%d candidates).\n",
-        scalar @files
-        if $verbose;
-
-    unless (@files) {
-        printf STDERR "Nothing to do.\n";
-        exit 0;
-    }
-
-    print STDERR "Generating hashes..."
-        if $verbose;
-    my $filehash = Directory::Simplify::FileHash->new;
-    my $report_every = 1; # seconds
-    my $processed_bytes = 0;
-    my $last_report = time;
-    my $total_size_hr = sprintf "%0.4G %s", Directory::Simplify::Utils::hr_size(&sum(map { $_->{size} } @files) or 0);
-    my $cb;
-    if ($print_progress) {
-        printf STDERR "\e\x{37}";
-        $cb = sub {
-            my ($file, $now) = (shift, time);
-            $processed_bytes += $file->{size};
-            if ($now >= $last_report + $report_every) {
-                printf STDERR "\e\x{38}%8s / %8s",
-                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
-                    $total_size_hr;
-                $last_report = $now;
-            }
-        };
-    }
-    $filehash->add({
-        files => \@files,
-        callback => $cb,
-    });
-    print STDERR "done.\n"
-        if $verbose;
-
-    my $generator = Directory::Simplify::Instruction::Generator->new(
-        filehash => $filehash,
-        min_size => ($include_zero_length_files ? 0 : 1),
-    );
-
-    my $freed_bytes = 0;
-
-    foreach my $inst ($generator->instructions) {
-        print STDERR $inst, "\n" if $verbose;
-        $inst->run;
-        $freed_bytes += $inst->bytes_freed;
-    }
-
-    printf STDERR "freed %s bytes (%0.4G %s)\n",
-        Directory::Simplify::Utils::addcommas($freed_bytes),
-        Directory::Simplify::Utils::hr_size($freed_bytes)
-        if $print_freed or $verbose;
-}
-
-package Directory::Simplify::Utils;
-use strict;
-use warnings;
-
-sub addcommas {
-    my @added;
-    foreach my $num (@_) {
-        # don't split anything after the decimal
-        my @parts = split /\./, $num;
-        while ($parts[0] =~ s/(\d)(\d{3}(?:\D|$))/$1,$2/) {
-        }
-        push @added, (join '.', @parts);
-    }
-    wantarray ? @added : $added[0]
-}
-
-sub hr_size {
-    my $sz = shift;
-    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
-    my $fact = 1024;
-    my $thresh = 0.1;
-    my @ret;
-    foreach my $exp (reverse 0 .. $#sizes) {
-        if ($sz > (1 - $thresh) * $fact ** $exp) {
-            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
-            last;
-        }
-    }
-
-    # default to ($sz, 'bytes')
-    @ret = ($sz, $sizes[0]) unless @ret;
-
-    wantarray ? @ret : "@ret"
-}
-
-sub shell_quote {
-    # shell-escape argument for inclusion in non-interpolated single quotes
-    my @transformed = map {
-        (my $out = $_)
-            =~ s/'/'\\''/g;
-        "'$out'";
-    } @_;
-    wantarray ? @transformed : $transformed[0];
-}
-
-package Directory::Simplify::Instruction::Hardlink;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-
-sub new {
-    my $class = shift;
-    return bless {
-        freed => 0,
-        @_,
-    }, $class;
-}
-
-sub run {
-    my $self = shift;
-    # hard link the files
-
-    unless (unlink $self->{target}->{name}) {
-        die "Failed to remove file `$self->{target}->{name}': $!\n";
-    }
-    unless (link $self->{source}->{name}, $self->{target}->{name}) {
-        die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
-    }
-    # bookkeeping
-    ++$self->{source}->{nlink};
-    if (--$self->{target}->{nlink} == 0) {
-        $self->{freed} = $self->{target}->{size};
-    }
-}
-
-sub bytes_freed {
-    my $self = shift;
-    return $self->{freed};
-}
-
-sub as_string {
-    my $self = shift;
-    return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
-}
-
-package Directory::Simplify::Instruction::CopyTimestamp;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-
-sub new {
-    my $class = shift;
-    return bless {
-        @_,
-    }, $class;
-}
-
-sub run {
-    my $self = shift;
-    # preserve older time stamp
-    utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
-}
-
-sub bytes_freed {
-    return 0;
-}
-
-sub as_string {
-    my $self = shift;
-    return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
-}
-
-package Directory::Simplify::Instruction::Generator;
-use strict;
-use warnings;
-use overload '""' => 'as_string';
-use File::Basename qw/ dirname /;
-use File::Compare qw/ compare /;
-
-sub new {
-    my $class = shift;
-    return bless {
-        filehash => undef,
-        min_size => 1,
-        @_,
-    }, $class;
-}
-
-sub as_string {
-    my $self = shift;
-    join "\n", $self->instructions;
-}
-
-sub buckets {
-    my $self = shift;
-
-    my @candidate_lists = $self->{filehash}->entries;
-
-    my @buckets;
-    foreach my $candidates (@candidate_lists) {
-        my @ca = @{$candidates}; # make a clone
-        my @these_buckets;
-
-        # at least two files needed to link together
-        if (@ca > 1) {
-            ELIMINATOR: while (@ca) {
-                my $entry = shift @ca;
-
-                next ELIMINATOR if $self->_entry_should_be_skipped($entry);
-
-                foreach my $bucket_idx (0 .. $#these_buckets) {
-                    if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
-                        push @{$these_buckets[$bucket_idx]}, $entry;
-                        next ELIMINATOR;
-                    }
-                }
-                # didn't find a bucket (even though the hash matched!)
-                # -> new bucket
-                push @these_buckets, [$entry];
-            }
-        }
-        push @buckets, @these_buckets;
-    }
-
-    @buckets
-}
-
-sub _entry_should_be_skipped {
-    my ($self, $entry_a) = @_;
-    # too small to be hard-linked
-    if ($entry_a->{size} < $self->{min_size}) {
-        return 1;
-    }
-    return 0;
-}
-
-sub _entries_are_hard_linkable {
-    my ($entry_a, $entry_b) = @_;
-
-    # obviously, if the sizes aren't the same, they're not the same file
-    unless (&_entries_sizes_match($entry_a, $entry_b)) {
-        return 0;
-    }
-
-    # they're the same file, don't try it
-    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
-        return 0;
-    }
-
-    if (&_entries_contents_match($entry_a, $entry_b)) {
-        return 1;
-    }
-    return 0;
-}
-
-sub oldest_mtime {
-    my $self = shift;
-    return sort {
-        $a->{mtime} <=> $b->{mtime}
-    } @_;
-}
-
-sub more_linked {
-    my $self = shift;
-    my %warned;
-    return sort {
-        if (! -w &dirname($a->{name})) {
-            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
-            return 1; # favor a -> front
-        } elsif (! -w &dirname($b->{name})) {
-            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
-            return -1; # favor b -> front
-        }
-        $b->{nlink} <=> $a->{nlink}
-    } @_;
-}
-
-sub _entries_are_already_hard_linked {
-    my ($entry_a, $entry_b) = @_;
-
-    if ($entry_a->{ino} == $entry_b->{ino} and
-        $entry_a->{dev} == $entry_b->{dev}) {
-        return 1;
-    }
-
-    return 0;
-}
-
-sub _entries_sizes_match {
-    my ($entry_a, $entry_b) = @_;
-    if ($entry_a->{size} != $entry_b->{size}) {
-        return 0;
-    }
-    return 1;
-}
-sub _entries_contents_match {
-    my ($entry_a, $entry_b) = @_;
-
-    my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
-
-    # warn about hash collision
-    unless ($contents_same) {
-        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
-    }
-    return $contents_same;
-}
-
-# generate hardlink instructions
-sub instructions {
-    require Scalar::Util;
-    my $self = shift;
-
-    # start generating instructions
-    my @inst;
-    foreach my $bucket ($self->buckets) {
-
-        # of the bucket, find the oldest timestamp
-        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
-
-        # of the bucket, find the file most embedded in the file system
-        my @to_link = $self->more_linked(@{$bucket});
-        my $most_linked_entry = shift @to_link;
-        foreach my $entry (@to_link) {
-            # XXX there shouldn't be a need to update entries' link counts,
-            # since this generates all the instructions at once
-            push @inst, Directory::Simplify::Instruction::Hardlink->new(
-                source => $most_linked_entry,
-                target => $entry,
-            );
-            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
-                source => $oldest_entry,
-                target => $entry,
-            );
-        }
-        if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
-            # most_linked_entry should get its timestamp updated
-            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
-                source => $oldest_entry,
-                target => $most_linked_entry,
-            );
-        }
-    }
-    @inst
-}
-
-package Directory::Simplify::File;
-use strict;
-use warnings;
-require Cwd;
-
-sub new {
-    my $class = shift;
-    my $rel_name = shift;
-    my $self = bless {
-        rel_name => $rel_name,
-        name => Cwd::abs_path($rel_name),
-    }, $class;
-    (@{$self}{qw/ dev ino mode nlink uid gid rdev size
-                  atime mtime ctime blksize blocks /})
-        = lstat $self->{name};
-    $self
-}
-
-sub hash {
-    my $self = shift;
-    unless (defined $self->{_hash}) {
-        require Digest::SHA;
-        my $ctx = Digest::SHA->new;
-        $ctx->addfile($self->{name});
-        $self->{_hash} = $ctx->hexdigest;
-    }
-    $self->{_hash}
-}
-
-package Directory::Simplify::FileHash;
-use strict;
-use warnings;
-
-=head1 DESCRIPTION
-
-Object for abstracting management of a hashed filesystem
-
-=cut
-
-sub new {
-    my $class = shift;
-    return bless {
-        _entries => {},
-        _files_in_hash => {},
-        @_,
-    }, $class;
-}
-
-sub add {
-    my $self = shift;
-    my (@files, $callback);
-    if (ref $_[0] eq 'HASH') {
-        # Called method like { files => [] }
-        my %opts = %{$_[0]};
-        @files = @{$opts{files}};
-        $callback = $opts{callback};
-    } else {
-        @files = @_;
-    }
-    foreach my $file (@files) {
-        unless (ref $file eq 'Directory::Simplify::File') {
-            $file = Directory::Simplify::File->new($file);
-        }
-        unless ($self->{_files_in_hash}->{$file->{name}}) {
-            my $hash = $file->hash;
-
-            unless (defined $self->{_entries}->{$hash}) {
-                $self->{_entries}->{$hash} = [];
-            }
-            push @{$self->{_entries}->{$hash}}, $file;
-            &{$callback}($file) if ref $callback eq 'CODE';
-        }
-        $self->{_files_in_hash}->{$file->{name}} = 1;
-    }
-}
-
-sub entries {
-    my $self = shift;
-    values %{$self->{_entries}}
-}
diff --git a/util/squash b/util/squash
new file mode 100755
index 0000000..3403c19
--- /dev/null
+++ b/util/squash
@@ -0,0 +1,60 @@
+#!/usr/bin/perl
+# vi: et sts=4 sw=4 ts=4
+
+# Squashes together the parts of the app into the single script.
+# (Adapted from the script that squashes App::Ack - see https://github.com/beyondgrep/ack3)
+use warnings;
+use strict;
+
+my $code;
+for my $arg (@ARGV) {
+    my $filename = $arg;
+    if ($arg =~ /::/) {
+        my $key = "$arg.pm";
+        $key =~ s{::}{/}g;
+        $filename = $INC{$key} or die "Can't find the file for $arg";
+    }
+
+    warn "Reading $filename\n";
+    open my $fh, '<', $filename or die "Can't open $filename: $!";
+    my $in_pod = 0;
+    my $in_ignore_section = 0;
+    while (<$fh>) {
+        if (/#.*:squash-ignore-start:$/) {
+            $in_ignore_section = 1;
+            next;
+        } elsif (/#.*:squash-ignore-end:$/) {
+            $in_ignore_section = 0;
+            next;
+        }
+        next if $in_ignore_section;
+        next if /#.*:squash-ignore-line:$/;
+
+        next if /^\s*1;$/;
+
+        if ($filename =~ /\.pm$/) {
+            # See if we're in module POD blocks
+            if (/^=(\w+)/) {
+                $in_pod = ($1 ne 'cut');
+                next;
+            }
+            elsif ($in_pod) {
+                next;
+            }
+            next if /^# vi:/;
+        }
+
+        # Remove Perl::Critic comments.
+        # I'd like to remove all comments, but this is a start
+        s{\s*##.+critic.*}{};
+        $code .= $_;
+    }
+    close $fh;
+}
+
+# Remove repeated newlines between paragraphs
+$code =~ s/\n\n+/\n\n/gs;
+
+print $code;
+
+exit 0;
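[Reviewer note, not part of the patch: what the squash markers used by the
modules above mean to util/squash. Everything between the start/end markers
is dropped from the combined script (the inter-module `require's are only
needed when running from the repository layout), and a single line can opt
out on its own. The `$dev_only` line is a hypothetical example.]

    # :squash-ignore-start:
    require Directory::Simplify::Utils;   # kept only in the repo layout
    # :squash-ignore-end:
    my $dev_only = 1;   # :squash-ignore-line: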