From c77fc7a205bcaebdc51acb689f0fd50c376e93b6 Mon Sep 17 00:00:00 2001
From: Dan Church <amphetamachine@gmail.com>
Date: Sun, 21 Jan 2018 15:48:49 -0600
Subject: [PATCH] Update script

Follows a more logical process of deciding what to do, then doing it.
---
 CHANGELOG.md           |   6 +
 simplify_static_dir.pl | 551 ++++++++++++++++++++++-------------------
 2 files changed, 304 insertions(+), 253 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 126185e..0d276f5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+## [3.0.0]
+
+### Changed
+
+- Re-order operations to make the linking happen at the very end
+
 ## [2.0.0]
 
 ### Fixed
diff --git a/simplify_static_dir.pl b/simplify_static_dir.pl
index 83582d7..19c5482 100755
--- a/simplify_static_dir.pl
+++ b/simplify_static_dir.pl
@@ -1,9 +1,12 @@
 #!/usr/bin/perl
 # vi: et sts=4 sw=4 ts=4
+
+package main;
 use strict;
 use warnings;
+require Cwd;
 
-our $VERSION = '2.0.0';
+our $VERSION = '3.0.0';
 
 =pod
 
@@ -117,142 +120,86 @@ EOF
     exit 0;
 }
 
-my %opts = (
-    v => 0,
-    f => 0,
-    m => '',
-    M => '',
-    z => 0,
-);
+MAIN: {
+    my %opts = (
+        v => 0,
+        f => 0,
+        m => '',
+        M => '',
+        z => 0,
+    );
 
-&getopts('vfm:M:z', \%opts);
+    &getopts('vfm:M:z', \%opts);
 
-my $filehash = new Directory::Simplify::FileHash;
+    # correct relative paths
+    # OR if no directories given, search the current directory
+    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));
 
-# include zero-length files if wanted (`-z')
-$filehash->min_linkable_size(0)
-    if $opts{z};
-
-# correct relative paths
-# OR if no directories given, search the current directory
-push @ARGV, $ENV{PWD} unless map { s#^([^/])#$ENV{PWD}/$1# } @ARGV;
-
-my $freed_bytes = 0;
-
-&find(\&findexec, @ARGV);
-
-printf STDERR "freed %d bytes (%0.4G %s)\n",
-    $freed_bytes, &hr_size($freed_bytes)
-        if $opts{f} or $opts{v};
-
-sub findexec {
-    # outright skip directories (don't report skip)
-    return if -d $File::Find::name;
-
-    # limit to or exclude file patterns specified by `-m' or `-M',
-    # respectively
-
-    # truth table
-    # -m matches    | -M is used & matches  | !return?
-    # 0     | 0         | 0
-    # 0     | 1         | 0
-    # 1     | 0         | 1
-    # 1     | 1         | 0
-    # note: m// will match everything
-    unless ($File::Find::name =~ m/$opts{m}/ and
-        !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {
-
-        print STDERR "Skipping path `$File::Find::name'\n"
-            if $opts{v};
-        return;
-    }
-
-    # skip non-existent files and links
-    unless (-f $File::Find::name && ! -l $File::Find::name) {
-        return;
-    }
-
-    my $entry = $filehash->make_entry($File::Find::name);
-
-    my @linkable = $filehash->find_hardlinkable($entry);
-    if (@linkable) {
-        &hardlink_entries($entry, @linkable);
-    }
-    $filehash->add_entry($entry);
-}
-
-sub hardlink_entries {
-    my ($entry, @linkable) = @_;
-
-    # only one of the linkable entries should suffice
-    my $linking_with = $linkable[0];
-
-    # calculate the timestamp of the resulting file
-    my ($atime, $mtime) = @{(
-        $filehash->oldest_mtime($entry, $linking_with)
-    )[0]}{qw/ atime mtime /};
-
-    # find the file less embedded in the file system
-    my ($less_linked, $more_linked) = $filehash->less_linked($entry, $linking_with);
-
-    printf STDERR "removing file `%s'\n", $less_linked->{name}
+    my @files;
+    print STDERR 'Finding files...'
         if $opts{v};
 
-    unless (unlink $less_linked->{name}) {
-        printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n",
-            $less_linked->{name},
-            $!,
-            $more_linked->{name};
+    &find(sub {
+        # outright skip directories (don't report skip)
+        return if -d $File::Find::name;
 
-        # if we can't do this, there's no point in continuing
-        unless (unlink $more_linked->{name}) {
-            printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
-                $more_linked->{name},
-                $!;
+        # limit to or exclude file patterns specified by `-m' or `-M',
+        # respectively
 
+        # truth table
+        # -m matches    | -M is used & matches  | !return?
+        # 0     | 0         | 0
+        # 0     | 1         | 0
+        # 1     | 0         | 1
+        # 1     | 1         | 0
+        # note: m// will match everything
+        unless ($File::Find::name =~ m/$opts{m}/ and
+            !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {
+
+            print STDERR "Skipping path `$File::Find::name'\n"
+                if $opts{v};
             return;
         }
 
-        # the ol' switcheroo
-        ($more_linked, $less_linked) =
-        ($less_linked, $more_linked);
+        # skip non-existent files and links
+        unless (-f $File::Find::name && ! -l $File::Find::name) {
+            return;
+        }
+
+        push @files, $File::Find::name;
+    }, @dirs_to_process);
+
+    printf STDERR "%s files found.\nGenerating hashes...", scalar @files
+        if $opts{v};
+
+    my $filehash = Directory::Simplify::FileHash->new;
+    $filehash->add(@files);
+    print STDERR "done.\n"
+        if $opts{v};
+
+    my $generator = Directory::Simplify::Instruction::Generator->new(
+        filehash => $filehash,
+        min_size => ($opts{z} ? 0 : 1),
+    );
+
+    my $freed_bytes = 0;
+
+    foreach my $inst ($generator->instructions) {
+        print STDERR $inst, "\n" if $opts{v};
+        $inst->run;
+        $freed_bytes += $inst->bytes_freed;
     }
 
-    # we unlinked it or failed out
-    $freed_bytes += $less_linked->{size}
-        unless $less_linked->{nlink} > 1;
-
-    printf STDERR "hard linking `%s' => `%s'\n",
-        $less_linked->{name}, $more_linked->{name}
-    if $opts{v};
-
-    # hard link the files
-    link $more_linked->{name},
-    $less_linked->{name};
-
-    # preserve older time stamp
-    utime $atime, $mtime, $more_linked->{name};
-    $more_linked->{atime} = $atime;
-    $more_linked->{mtime} = $mtime;
-
-    # update link count in our hash to reflect the file system (referenced)
-    ++$more_linked->{nlink};
-
-    # update old entry with the info from the new one
-    foreach my $copy_attr (qw/
-        ino
-        nlink
-        mode
-        uid
-        gid
-        atime
-        mtime
-        ctime
-    /) {
-        $less_linked->{$copy_attr} = $more_linked->{$copy_attr};
-    }
+    printf STDERR "freed %d bytes (%0.4G %s)\n",
+        $freed_bytes,
+        Directory::Simplify::Utils::hr_size($freed_bytes)
+            if $opts{f} or $opts{v};
 }
 
+package Directory::Simplify::Utils;
+use strict;
+use warnings;
+
 sub hr_size {
     my $sz = shift;
     my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
@@ -272,137 +219,148 @@ sub hr_size {
     wantarray ? @ret : "@ret"
 }
 
-package Directory::Simplify::FileHash;
+sub shell_quote {
+    # shell-escape argument for inclusion in non-interpolated single quotes
+    my @transformed = map {
+        (my $out = $_)
+            =~ s/'/'\\''/g;
+        "'$out'";
+    } @_;
+    wantarray ? @transformed : $transformed[0];
+}
+
+package Directory::Simplify::Instruction::Hardlink;
 use strict;
 use warnings;
-
-=head1 DESCRIPTION
-
-Object for abstracting management of a hashed filesystem
-
-=cut
+use overload '""' => 'as_string';
 
 sub new {
-    my ($class, $self) = (shift, {});
-
-    $self->{_files} = {};
-
-    require Digest::SHA;
-    $self->{_ctx} = Digest::SHA->new;
-
-    # default options
-    $self->{_min_linkable_size} = 1;
-
-    bless $self, $class
+    my $class = shift;
+    return bless {
+        freed => 0,
+        @_,
+    }, $class;
 }
 
-=head2 min_linkable_size($bytes)
-
-Set or get the minimum size of files to be considered hard-linkable. Default is 1.
-
-=cut
-
-sub min_linkable_size {
+sub run {
     my $self = shift;
-    my $in = shift;
-    if (defined $in) {
-        $self->{_min_linkable_size} = $in;
+    # hard link the files
+
+    unless (unlink $self->{target}->{name}) {
+        die "Failed to remove file `$self->{target}->{name}': $!\n";
     }
-    $self->{_min_linkable_size}
-}
-
-=head2 make_entry($filename)
-
-=cut
-
-sub make_entry {
-    my $self = shift;
-    my ($filename) = @_;
-    # organize results from lstat into hash
-    my $entry = {};
-    (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
-            atime mtime ctime blksize blocks /})
-        = ($filename, lstat $filename);
-
-    $entry->{hash} = $self->_hash($filename);
-
-    $entry
-}
-
-=head2 add_entry($entry)
-
-=cut
-
-sub add_entry {
-    my $self = shift;
-    my ($entry) = @_;
-
-    my $hash = $entry->{hash};
-
-    unless (defined $self->{_files}->{$hash}) {
-        $self->{_files}->{$hash} = [];
+    unless (link $self->{source}->{name}, $self->{target}->{name}) {
+        die "Failed to hard link `$self->{source}->{name}' => `$self->{target}->{name}': $!";
+    }
+    # bookkeeping
+    ++$self->{source}->{nlink};
+    if (--$self->{target}->{nlink} == 0) {
+        $self->{freed} = $self->{target}->{size};
     }
-    push @{$self->{_files}->{$hash}}, $entry;
 }
 
-sub find_hardlinkable {
+sub bytes_freed {
     my $self = shift;
-    my ($entry) = @_;
+    return $self->{freed};
+}
 
-    my $hash = $entry->{hash};
+sub as_string {
+    my $self = shift;
+    return sprintf 'ln -sf %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
 
-    # no matching entries
-    unless (defined $self->{_files}->{$hash}) {
-        return ();
-    }
+package Directory::Simplify::Instruction::CopyTimestamp;
+use strict;
+use warnings;
+use overload '""' => 'as_string';
 
-    my @matches;
-    foreach my $ent (@{$self->{_files}->{$hash}}) {
-        if ($self->_entries_are_hard_linkable($entry, $ent)) {
-            push @matches, $ent;
+sub new {
+    my $class = shift;
+    return bless {
+        @_,
+    }, $class;
+}
+
+sub run {
+    my $self = shift;
+    # preserve older time stamp
+    utime $self->{source}->{atime}, $self->{source}->{mtime}, $self->{target}->{name};
+}
+
+sub bytes_freed {
+    return 0;
+}
+
+sub as_string {
+    my $self = shift;
+    return sprintf 'touch -r %s %s', Directory::Simplify::Utils::shell_quote($self->{source}->{name}, $self->{target}->{name});
+}
+
+package Directory::Simplify::Instruction::Generator;
+use strict;
+use warnings;
+use overload '""' => 'as_string';
+use File::Basename qw/ dirname /;
+use File::Compare qw/ compare /;
+
+sub new {
+    my $class = shift;
+    return bless {
+        filehash => undef,
+        min_size => 1,
+        @_,
+    }, $class;
+}
+
+sub as_string {
+    my $self = shift;
+    join "\n", $self->instructions;
+}
+
+sub buckets {
+    my $self = shift;
+
+    my @candidate_lists = $self->{filehash}->entries;
+
+    my @buckets;
+    foreach my $candidates (@candidate_lists) {
+        my @ca = @{$candidates}; # make a clone
+        my @these_buckets;
+
+        # at least two files needed to link together
+        if (@ca > 1) {
+            ELIMINATOR: while (@ca) {
+                my $entry = shift @ca;
+
+                next ELIMINATOR if $self->_entry_should_be_skipped($entry);
+
+                foreach my $bucket_idx (0 .. $#these_buckets) {
+                    if (&_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
+                        push @{$these_buckets[$bucket_idx]}, $entry;
+                        next ELIMINATOR;
+                    }
+                }
+                # didn't find a bucket (even though the hash matched!)
+                # -> new bucket
+                push @these_buckets, [$entry];
+            }
         }
+        push @buckets, @these_buckets;
     }
 
-    return @matches;
-
+    @buckets
 }
 
-=head2 oldest($entry_a, $entry_b, ...)
-
-Find the file less embedded in the file system.
-
-=cut
-
-sub less_linked {
-    my $self = shift;
-    return sort
-        {$a->{nlink} <=> $b->{nlink}}
-        @_;
-}
-
-=head2 oldest($entry_a, $entry_b, ...)
-
-Find the entry with the oldest time stamp.
-
-=cut
-
-sub oldest_mtime {
-    my $self = shift;
-
-    return sort
-        {$a->{mtime} <=> $b->{mtime}}
-        @_;
-}
-
-sub _hash {
-    my $self = shift;
-    my ($filename) = @_;
-    $self->{_ctx}->addfile($filename);
-    return $self->{_ctx}->hexdigest;
+sub _entry_should_be_skipped {
+    my ($self, $entry_a) = @_;
+    # too small to be hard-linked
+    if ($entry_a->{size} < $self->{min_size}) {
+        return 1;
+    }
+    return 0;
 }
 
 sub _entries_are_hard_linkable {
-    my $self = shift;
     my ($entry_a, $entry_b) = @_;
 
     # obviously, if the sizes aren't the same, they're not the same file
@@ -410,15 +368,7 @@ sub _entries_are_hard_linkable {
         return 0;
     }
 
-    # too small to be hard-linked
-    if ($entry_a->{size} < $self->min_linkable_size) {
-        return 0;
-    }
-
     # they're the same file, don't try it
-    if (&_entries_are_same_filename($entry_a, $entry_b)) {
-        return 0;
-    }
     if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
         return 0;
     }
@@ -429,13 +379,26 @@ sub _entries_are_hard_linkable {
     return 0;
 }
 
-sub _entries_are_same_filename {
-    my ($entry_a, $entry_b) = @_;
+sub oldest_mtime {
+    my $self = shift;
+    return sort {
+        $a->{mtime} <=> $b->{mtime}
+    } @_;
+}
 
-    if ($entry_a->{name} eq $entry_b->{name}) {
-        return 1;
-    }
-    return 0;
+sub more_linked {
+    my $self = shift;
+    my %warned;
+    return sort {
+        if (! -w &dirname($a->{name})) {
+            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
+            return 1; # favor a -> front
+        } elsif (! -w &dirname($b->{name})) {
+            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
+            return -1; # favor b -> front
+        }
+        $b->{nlink} <=> $a->{nlink}
+    } @_;
 }
 
 sub _entries_are_already_hard_linked {
@@ -456,17 +419,10 @@ sub _entries_sizes_match {
     }
     return 1;
 }
-
 sub _entries_contents_match {
     my ($entry_a, $entry_b) = @_;
 
-    # also, if the hashes aren't the same, they cannot be the same file
-    if ($entry_a->{hash} ne $entry_b->{hash}) {
-        return 0;
-    }
-
-    use File::Compare   qw/ compare /;
-    my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name}));
+    my $contents_same = (0 == &File::Compare::compare($entry_a->{name}, $entry_b->{name}));
 
     # warn about hash collision
     unless ($contents_same) {
@@ -474,3 +430,92 @@ sub _entries_contents_match {
     }
     return $contents_same;
 }
+
+# generate hardlink instructions
+sub instructions {
+    require Scalar::Util;
+    my $self = shift;
+
+    # start generating instructions
+    my @inst;
+    foreach my $bucket ($self->buckets) {
+
+        # of the bucket, find the oldest timestamp
+        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});
+
+        # of the bucket, find the file most embedded in the file system
+        my @to_link = $self->more_linked(@{$bucket});
+        my $most_linked_entry = shift @to_link;
+        foreach my $entry (@to_link) {
+            # XXX there shouldn't be a need to update entries' link counts,
+            # since this generates all the instructions at once
+            push @inst, Directory::Simplify::Instruction::Hardlink->new(
+                source => $most_linked_entry,
+                target => $entry,
+            );
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $entry,
+            );
+        }
+        if (&Scalar::Util::refaddr($most_linked_entry) != &Scalar::Util::refaddr($oldest_entry)) {
+            # most_linked_entry should get its timestamp updated
+            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
+                source => $oldest_entry,
+                target => $most_linked_entry,
+            );
+        }
+    }
+    @inst
+}
+
+package Directory::Simplify::FileHash;
+use strict;
+use warnings;
+require Digest::SHA;
+
+=head1 DESCRIPTION
+
+Object for abstracting management of a hashed filesystem
+
+=cut
+
+sub new {
+    my $class = shift;
+    return bless {
+        _entries => {},
+        _files_in_hash => {},
+        @_,
+    }, $class;
+}
+
+sub add {
+    require Cwd;
+    my $self = shift;
+    my $ctx = $self->{_ctx};
+    unless (defined $ctx) {
+        $ctx = $self->{_ctx} = Digest::SHA->new;
+    }
+    foreach my $filename (@_) {
+        $filename = Cwd::abs_path($filename);
+        unless ($self->{_files_in_hash}->{$filename}) {
+            my $entry = {};
+            (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
+                atime mtime ctime blksize blocks /})
+            = ($filename, lstat $filename);
+
+            $ctx->addfile($filename);
+            my $hash = $ctx->hexdigest;
+            unless (defined $self->{_entries}->{$hash}) {
+                $self->{_entries}->{$hash} = [];
+            }
+            push @{$self->{_entries}->{$hash}}, $entry;
+        }
+        $self->{_files_in_hash}->{$filename} = 1;
+    }
+}
+
+sub entries {
+    my $self = shift;
+    values %{$self->{_entries}}
+}