# mirror of https://codeberg.org/h3xx/simplify_static_dir
# synced 2024-08-14 23:57:24 +00:00
#!/usr/bin/perl
# simplify_static_dir - save disk space by hard-linking identical files.
# See the POD below for full usage documentation.
package main;

use strict;
use warnings;

# sum() is used later to total candidate file sizes; note it returns
# undef on an empty list (the caller guards with `or 0`).
use List::Util qw/ sum /;

our $VERSION = '3.1.0';
|
=pod
|
|
|
|
=head1 NAME
|
|
|
|
simplify_static_dir - optimize directories for size by combining identical
|
|
files
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
The more files this script can process at once, the better. It maintains an
|
|
internal hash of files indexed by their SHA1 checksum. When it finds a
|
|
collision it removes the file with the least amount of file system links, then
|
|
creates a hard link to the other in its place. The larger the number of files
|
|
scanned, the more likely it is that there will be collisions.
|
|
|
|
There are failsafes in place, though. If somehow two files' SHA1 checksums are
|
|
identical, but the file sizes do not match, this program will error out (and
|
|
you can go ahead and report the collision; it may be worth money).
|
|
|
|
There are other features built in as well. Following the logic that unique
data, once created, has the attribute of being unique to that point in time,
this script will copy the timestamp of the older file onto the newer file just
before it makes the hard link. Therefore, many times, simplified directories
will have the appearance of being older than they actually are.
|
|
|
|
From the perspective of any program reading files from the simplified
|
|
directories, the files will lookB<*> and behave the same way as the initial
|
|
state.
|
|
|
|
B<*> Except for having an increased number of hard links.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over
|
|
|
|
=item B<-v>
|
|
|
|
Verbose output.
|
|
|
|
=item B<-f>
|
|
|
|
Print a sum of the number of freed bytes.
|
|
|
|
=item B<-m> I<REGEX>
|
|
|
|
Only match file paths matching I<REGEX>.
|
|
|
|
=item B<-M> I<REGEX>
|
|
|
|
Exclude file paths matching I<REGEX>.
|
|
|
|
=item B<-z>
|
|
|
|
Include zero-length files in search. Normally they are ignored (you don't save
|
|
diskspace by hard-linking empty files).
|
|
|
|
=item B<--help>
|
|
|
|
Output this help message and exit.
|
|
|
|
=item B<--version>
|
|
|
|
Output version information and exit.
|
|
|
|
=back
|
|
|
|
By default, scans the current directory.
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright (C) 2010-2022 Dan Church.
|
|
|
|
License GPLv3: GNU GPL version 3.0 (L<https://www.gnu.org/licenses/gpl-3.0.html>)
|
|
|
|
with Commons Clause 1.0 (L<https://commonsclause.com/>).
|
|
|
|
This is free software: you are free to change and redistribute it.
|
|
|
|
There is NO WARRANTY, to the extent permitted by law.
|
|
|
|
You may NOT use this software for commercial purposes.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
|
|
|
|
=cut
|
|
|
|
use Cwd qw//;
use File::Find qw/ find /;
use Getopt::Std qw/ getopts /;
use Pod::Usage qw/ pod2usage /;

# :squash-remove-start:
# (this prepends to the load path)
use FindBin qw//;
use lib "$FindBin::RealBin/lib";

require Directory::Simplify::File;
require Directory::Simplify::FileHash;
require Directory::Simplify::Instruction::Generator;
require Directory::Simplify::Utils;
# :squash-remove-end:
|
|
|
|
# Hook called by Getopt::Std when `--help` is given. Getopt::Std passes
# the filehandle that help output should be written to as the first
# argument; the original captured it in $fh but never used it, so help
# always went to pod2usage's default stream. Honor the handle when one
# is supplied. Prints the POD usage summary and exits 0.
sub HELP_MESSAGE {
    my $fh = shift;
    pod2usage(
        -verbose => 1,
        -exitval => 0,
        # Direct output to the handle Getopt::Std gave us, when present;
        # otherwise fall back to pod2usage's default (STDOUT on exit 0).
        (defined $fh ? (-output => $fh) : ()),
    );
    return; # not reached; pod2usage exits the process
}
|
|
|
|
# Main program: parse options, collect files under the target directories,
# filter them down to duplicate-size candidates, hash those candidates, and
# replace duplicate content with hard links via generated instructions.
MAIN: {
    # Options: -v verbose, -f print freed byte total, -m/-M include/exclude
    # path regex, -z include zero-length files. Bad usage exits 2.
    getopts('vfm:M:z', \ my %opts)
        || pod2usage(
            -exitval => 2,
            -msg => "Try '$0 --help' for more information",
        );

    my $verbose = defined $opts{v};
    my $print_freed = defined $opts{f};
    my $files_match = $opts{m} || '';
    my $files_exclude = $opts{M} || '';
    my $include_zero_length_files = defined $opts{z};
    my $print_progress = $verbose;

    # correct relative paths
    # OR if no directories given, search the current directory
    # NOTE(review): Cwd is not use'd in this file; Cwd::abs_path only
    # resolves because File::Find loads Cwd internally -- an explicit
    # `use Cwd` would be safer. Also $ENV{PWD} can be unset or stale when
    # the script is not launched from a shell -- TODO confirm.
    my @dirs_to_process = map { Cwd::abs_path($_) } (@ARGV ? @ARGV : ($ENV{PWD}));

    my @files;
    if ($verbose) {
        print STDERR 'Finding files...';
    }

    # Collect every regular, non-symlink file below the target directories.
    find(sub {
        # outright skip directories (don't report skip)
        return if -d $File::Find::name;

        # skip non-existent files and links
        if (! -f $File::Find::name || -l $File::Find::name) {
            return;
        }

        push @files, Directory::Simplify::File->new($File::Find::name);
    }, @dirs_to_process);

    if ($verbose) {
        printf STDERR '%d files found',
            scalar @files;
    }

    # Limit to or exclude file patterns specified by `-m' or `-M', respectively
    #
    # Truth table:
    # -m matches | -M is used & matches | !return?
    # 0 | 0 | 0
    # 0 | 1 | 0
    # 1 | 0 | 1
    # 1 | 1 | 0
    # An empty -m pattern is intended to match everything.
    # NOTE(review): `$str =~ ''` actually re-uses the *last successful*
    # regex (Perl's empty-pattern rule), which behaves like match-all only
    # while no earlier match has succeeded -- TODO confirm no regex runs
    # before this point (e.g. inside the modules loaded above).
    my $file_ct_before_filter = scalar @files;
    @files = grep {
        $_->{rel_name} =~ $files_match
    } @files;
    if ($verbose && $file_ct_before_filter != scalar @files) {
        printf STDERR ' (%d files filtered by -m rule)',
            $file_ct_before_filter - scalar @files;
    }
    # -M is only applied when a non-empty pattern was given.
    if (length $files_exclude) {
        $file_ct_before_filter = scalar @files;
        @files = grep {
            not $_->{rel_name} =~ $files_exclude
        } @files;
        if ($verbose && $file_ct_before_filter != scalar @files) {
            printf STDERR ' (%d files filtered by -M rule)',
                $file_ct_before_filter - scalar @files;
        }
    }

    # Shortcut: Only generate hashes and inspect files that do not have a
    # unique size. The reasoning being that file sizes do not match, there's no
    # possible way those two files can have the same contents.
    my %file_sizes;
    foreach my $file (@files) {
        ++$file_sizes{$file->{size}};
    }
    @files = grep {
        $file_sizes{$_->{size}} > 1
    } @files;

    if ($verbose) {
        printf STDERR " (%d candidates).\n",
            scalar @files;
    }

    # No two files share a size -> no duplicates possible; stop early.
    unless (@files) {
        printf STDERR "Nothing to do.\n";
        exit 0;
    }

    if ($verbose) {
        print STDERR 'Generating hashes...';
    }
    my $filehash = Directory::Simplify::FileHash->new;
    my $report_every = 1; # seconds
    my $processed_bytes = 0;
    my $last_report = time;
    # Human-readable grand total of candidate bytes, e.g. "1.234 MB".
    # sum() returns undef for an empty list, hence the `or 0` fallback
    # (unreachable here since @files is non-empty, but kept defensive).
    my $total_size_hr = sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size(sum(map { $_->{size} } @files) or 0);
    my $cb;
    if ($print_progress) {
        # ESC 7 (DECSC): save the cursor position so each progress report
        # can repaint over the previous one.
        printf STDERR "\e\x{37}";
        # Per-file callback invoked by FileHash->add; rate-limited to at
        # most one terminal update every $report_every seconds.
        $cb = sub {
            my ($file, $now) = (shift, time);
            $processed_bytes += $file->{size};
            if ($now >= $last_report + $report_every) {
                # ESC 8 (DECRC): restore the saved cursor position, then
                # overwrite the "<processed> / <total>" figure in place.
                printf STDERR "\e\x{38}%8s / %8s",
                    (sprintf '%0.4G %s', Directory::Simplify::Utils::hr_size($processed_bytes)),
                    $total_size_hr;
                $last_report = $now;
            }
        };
    }
    # $cb is undef when not printing progress; FileHash->add is expected
    # to tolerate a missing callback -- presumed, verify in FileHash.
    $filehash->add(
        files => \@files,
        callback => $cb,
    );
    if ($verbose) {
        print STDERR "done.\n";
    }

    # The generator yields link/remove instructions for each group of
    # identical files; min_size 0 admits empty files only under -z.
    my $generator = Directory::Simplify::Instruction::Generator->new(
        filehash => $filehash,
        min_size => ($include_zero_length_files ? 0 : 1),
    );

    my $freed_bytes = 0;

    # Execute each instruction, tallying the bytes it reclaimed.
    foreach my $inst ($generator->instructions) {
        if ($verbose) {
            print STDERR $inst, "\n";
        }
        $inst->run;
        $freed_bytes += $inst->bytes_freed;
    }

    if ($print_freed or $verbose) {
        printf STDERR "freed %s bytes (%0.4G %s)\n",
            Directory::Simplify::Utils::addcommas($freed_bytes),
            Directory::Simplify::Utils::hr_size($freed_bytes);
    }
}
|
|
|