simplify_static_dir/simplify_static_dir-main.pl

#!/usr/bin/perl

package main;
use strict;
use warnings;
use List::Util qw/ sum /;

our $VERSION = '3.1.0';

=pod

=head1 NAME

simplify_static_dir - optimize directories for size by combining identical
files

=head1 SYNOPSIS

B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...

=head1 DESCRIPTION

The more files this script can process at once, the better. It maintains an
internal hash of files indexed by their SHA1 checksum. When it finds a
collision it removes the file with the least amount of file system links, then
creates a hard link to the other in its place. The larger the number of files
scanned, the more likely it is that there will be collisions.

There are failsafes in place, though. If somehow two files' SHA1 checksums are
identical, but the file sizes do not match, this program will error out (and
you can go ahead and report the collision; it may be worth money).

There are other features built in as well. Following the logic that unique
data, once created has the attribute of being unique of that point in time,
this script will copy the timestamp of the older file onto the newer file just
before it makes the hard link. Therefore, many times, simplified directories
will have the appearance of being older than they actually are.

From the perspective of any program reading files from the simplified
directories, the files will lookB<*> and behave the same way as the initial
state.

B<*> Except for having an increased number of hard links.

=head1 OPTIONS

=over

=item B<-v>

Verbose output.

=item B<-f>

Print a sum of the number of freed bytes.

=item B<-m> I<REGEX>

Only match file paths matching I<REGEX>.

=item B<-M> I<REGEX>

Exclude file paths matching I<REGEX>.

=item B<-z>

Include zero-length files in search. Normally they are ignored (you don't save
diskspace by hard-linking empty files).

=item B<--help>

Output this help message and exit.

=item B<--version>

Output version information and exit.

=back

By default, scans the current directory.

=head1 COPYRIGHT

Copyright (C) 2010-2022 Dan Church.

License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)

with Commons Clause 1.0 (L<https://commonsclause.com/>).

This is free software: you are free to change and redistribute it.

There is NO WARRANTY, to the extent permitted by law.

You may NOT use this software for commercial purposes.

=head1 AUTHOR

Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>

=cut

use File::Find qw/ find /;
use Getopt::Std qw/ getopts /;
use Pod::Usage qw/ pod2usage /;

# :squash-remove-start:
# (this prepends to the load path)
use FindBin qw//;
use lib "$FindBin::RealBin/lib";

require Directory::Simplify::File;
require Directory::Simplify::FileHash;
require Directory::Simplify::Instruction::Generator;
require Directory::Simplify::Utils;
# :squash-remove-end:

sub HELP_MESSAGE {
    my $fh = shift;
    pod2usage(
        -verbose => 1,
        -exitval => 0,
    );
    return;
}

MAIN: {
    getopts('vfm:M:z', \ my %opts)
        || pod2usage(
            -exitval => 2,
            -msg => "Try '$0 --help' for more information",
        );

    my $verbose = defined $opts{v};
    my $print_freed = defined $opts{f};
    my $files_match = $opts{m} || '';
    my $files_exclude = $opts{M} || '';
    my $include_zero_length_files = defined $opts{z};
    my $print_progress = $verbose;

    # correct relative paths
    # OR if no directories given, search the current directory
    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));

    my @files;
    if ($verbose) {
        print STDERR 'Finding files...';
    }

    find(sub {
        # outright skip directories (don't report skip)
        return if -d $File::Find::name;

        # skip non-existent files and links
        if (! -f $File::Find::name || -l $File::Find::name) {
            return;
        }

        push @files, Directory::Simplify::File->new($File::Find::name);
    }, @dirs_to_process);

    if ($verbose) {
        printf STDERR '%d files found',
            scalar @files;
    }

    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
    #
    # Truth table:
    # -m matches    | -M is used & matches  | !return?
    # 0             | 0                     | 0
    # 0             | 1                     | 0
    # 1             | 0                     | 1
    # 1             | 1                     | 0
    # note: m// will match everything
    my $file_ct_before_filter = scalar @files;
    @files = grep {
        $_->{rel_name} =~ $files_match
    } @files;
    if ($verbose && $file_ct_before_filter != scalar @files) {
        printf STDERR ' (%d files filtered by -m rule)',
            $file_ct_before_filter - scalar @files;
    }
    if (length $files_exclude) {
        $file_ct_before_filter = scalar @files;
        @files = grep {
            not $_->{rel_name} =~ $files_exclude
        } @files;
        if ($verbose && $file_ct_before_filter != scalar @files) {
            printf STDERR ' (%d files filtered by -M rule)',
                $file_ct_before_filter - scalar @files;
        }
    }

    # Shortcut: Only generate hashes and inspect files that do not have a
    # unique size. The reasoning being that file sizes do not match, there's no
    # possible way those two files can have the same contents.
    my %file_sizes;
    foreach my $file (@files) {
        ++$file_sizes{$file->{size}};
    }
    @files = grep {
        $file_sizes{$_->{size}} > 1
    } @files;

    if ($verbose) {
        printf STDERR " (%d candidates).\n",
            scalar @files;
    }

    unless (@files) {
        printf STDERR "Nothing to do.\n";
        exit 0;
    }

    if ($verbose) {
        print STDERR 'Generating hashes...';
    }
    my $filehash = Directory::Simplify::FileHash->new;
    my $report_every = 1; # seconds
    my $processed_bytes = 0;
    my $last_report = time;
    my $total_size_hr = sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size(sum(map { $_->{size} } @files) or 0);
    my $cb;
    if ($print_progress) {
        printf STDERR "\e\x{37}";
        $cb = sub {
            my ($file, $now) = (shift, time);
            $processed_bytes += $file->{size};
            if ($now >= $last_report + $report_every) {
                printf STDERR "\e\x{38}%8s / %8s",
                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
                    $total_size_hr;
                $last_report = $now;
            }
        };
    }
    $filehash->add(
        files => \@files,
        callback => $cb,
    );
    if ($verbose) {
        print STDERR "done.\n";
    }

    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => ($include_zero_length_files ? 0 : 1),
    );

    my $freed_bytes = 0;

    foreach my $inst ($generator->instructions) {
        if ($verbose) {
            print STDERR $inst, "\n";
        }
        $inst->run;
        $freed_bytes += $inst->bytes_freed;
    }

    if ($print_freed or $verbose) {
        printf STDERR "freed %s bytes (%0.4G %s)\n",
            Directory::Simplify::Utils::addcommas($freed_bytes),
            Directory::Simplify::Utils::hr_size($freed_bytes);
    }
}