#!/usr/bin/perl
# vi: et sts=4 sw=4 ts=4
use strict;
use warnings;

our $VERSION = '2.0.0';

=pod

=head1 NAME

simplify_static_dir - optimize directories for size by combining identical
files

=head1 SYNOPSIS

B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...

=head1 DESCRIPTION

The more files this script can process at once, the better. It maintains an
internal hash of files indexed by their SHA1 checksum. When it finds a
collision it removes the file with the fewest file system links, then creates
a hard link to the other in its place. The larger the number of files scanned,
the more likely it is that there will be collisions.

There are failsafes in place, though. If somehow two files' SHA1 checksums are
identical but the file sizes or contents do not match, this program will
refuse to link them, warning about any genuine hash collision it finds (and
you can go ahead and report the collision; it may be worth money).

There are other features built in as well. Following the logic that unique
data, once created, is unique to that point in time, this script will copy the
timestamp of the older file onto the newer file just before it makes the hard
link. (For example, when identical files from 2015 and 2017 are merged, the
surviving file keeps the 2015 timestamps.) Therefore, many times, simplified
directories will have the appearance of being older than they actually are.

From the perspective of any program reading files from the simplified
directories, the files will lookB<*> and behave the same way as the initial
state.

B<*> Except for having an increased number of hard links.

=head1 OPTIONS

=over

=item B<-v>

Verbose output.

=item B<-f>

Print the total number of bytes freed.

=item B<-m> I<REGEX>

Only process file paths matching I<REGEX>.

=item B<-M> I<REGEX>

Exclude file paths matching I<REGEX>.

=item B<-z>

Include zero-length files in the search. Normally they are ignored (you don't
save disk space by hard-linking empty files).

=item B<--help>

Output this help message and exit.

=item B<--version>

Output version information and exit.

=back

By default, scans the current directory.
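
For example, to simplify a tree of static files in place and report how much
space was reclaimed (the directory shown is only illustrative):

    simplify_static_dir -v -f /var/www/static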

=head1 COPYRIGHT

Copyright (C) 2010-2018 Dan Church.

License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).

This is free software: you are free to change and redistribute it.

There is NO WARRANTY, to the extent permitted by law.

=head1 AUTHOR

Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>

=cut

use File::Find qw/ find /;
require Digest::SHA;
use Getopt::Std qw/ getopts /;

sub HELP_MESSAGE {
    my $fh = shift;
    print $fh <<EOF
Usage: $0 [OPTIONS] [DIRS]
Simplify a directory by hard-linking identical files.

  -v        Verbose output.
  -f        Print the total number of bytes freed.
  -m REGEX  Only process file paths matching REGEX.
  -M REGEX  Exclude file paths matching REGEX.
  -z        Include zero-length files in search.

By default, scans the current directory.

See also `perldoc $0'
EOF
;
    exit 0;
}

my %opts = (
    v => 0,
    f => 0,
    m => '',
    M => '',
    z => 0,
);

&getopts('vfm:M:z', \%opts);

my $filehash = Directory::Simplify::FileHash->new;

# include zero-length files if wanted (`-z')
$filehash->min_linkable_size(0)
    if $opts{z};

# make relative paths absolute, and if no directories were given,
# search the current directory
s#^([^/])#$ENV{PWD}/$1# foreach @ARGV;
push @ARGV, $ENV{PWD} unless @ARGV;

my $freed_bytes = 0;

&find(\&findexec, @ARGV);

printf STDERR "freed %d bytes (%0.4G %s)\n",
    $freed_bytes, &hr_size($freed_bytes)
    if $opts{f} or $opts{v};

sub findexec {
    # outright skip directories (don't report skip)
    return if -d $File::Find::name;

    # limit to or exclude file patterns specified by `-m' or `-M',
    # respectively

    # truth table
    # -m matches | -M is used & matches | process?
    #     0      |          0           |    0
    #     0      |          1           |    0
    #     1      |          0           |    1
    #     1      |          1           |    0
    # note: an unset `-m' matches everything; it is tested explicitly,
    # because interpolating an empty pattern into m// would re-use the
    # last successful match instead of matching everything
    unless ((!length $opts{m} or $File::Find::name =~ m/$opts{m}/) and
        !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {

        print STDERR "Skipping path `$File::Find::name'\n"
            if $opts{v};
        return;
    }

    # skip non-existent files and links
    unless (-f $File::Find::name && ! -l $File::Find::name) {
        return;
    }

    my $entry = $filehash->make_entry($File::Find::name);

    my @linkable = $filehash->find_hardlinkable($entry);
    if (@linkable) {
        &hardlink_entries($entry, @linkable);
    }
    $filehash->add_entry($entry);
}

sub hardlink_entries {
    my ($entry, @linkable) = @_;

    # only one of the linkable entries should suffice
    my $linking_with = $linkable[0];

    # calculate the timestamp of the resulting file
    my ($atime, $mtime) = @{(
        $filehash->oldest_mtime($entry, $linking_with)
    )[0]}{qw/ atime mtime /};

    # find the file less embedded in the file system
    my ($less_linked, $more_linked) =
        $filehash->less_linked($entry, $linking_with);

    printf STDERR "removing file `%s'\n", $less_linked->{name}
        if $opts{v};

    unless (unlink $less_linked->{name}) {
        printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n",
            $less_linked->{name},
            $!,
            $more_linked->{name};

        # if we can't do this, there's no point in continuing
        unless (unlink $more_linked->{name}) {
            printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
                $more_linked->{name},
                $!;

            return;
        }

        # the ol' switcheroo
        ($more_linked, $less_linked) =
            ($less_linked, $more_linked);
    }

    # we unlinked it or failed out
    $freed_bytes += $less_linked->{size}
        unless $less_linked->{nlink} > 1;

    printf STDERR "hard linking `%s' => `%s'\n",
        $less_linked->{name}, $more_linked->{name}
        if $opts{v};

    # hard link the files
    link $more_linked->{name}, $less_linked->{name}
        or warn sprintf "Failed to hard link `%s' => `%s': %s\n",
            $less_linked->{name}, $more_linked->{name}, $!;

    # preserve older time stamp
    utime $atime, $mtime, $more_linked->{name};
    $more_linked->{atime} = $atime;
    $more_linked->{mtime} = $mtime;

    # update link count in our hash to reflect the file system (referenced)
    ++$more_linked->{nlink};

    # update old entry with the info from the new one
    foreach my $copy_attr (qw/
        ino nlink mode uid gid atime mtime ctime
    /) {
        $less_linked->{$copy_attr} = $more_linked->{$copy_attr};
    }
}

sub hr_size {
    my $sz = shift;
    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
    my $fact = 1024;
    my $thresh = 0.1;
    my @ret;
    foreach my $exp (reverse 0 .. $#sizes) {
        if ($sz > (1 - $thresh) * $fact ** $exp) {
            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
            last;
        }
    }

    # default to ($sz, 'bytes')
    @ret = ($sz, $sizes[0]) unless @ret;

    wantarray ? @ret : "@ret"
}
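
# Example usage (illustrative values, not part of the program's flow):
#   my ($n, $unit) = hr_size(5_000_000);  # (4.76837158203125, 'MB')
#   my $pretty     = hr_size(5_000_000);  # "4.76837158203125 MB" in scalar context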

package Directory::Simplify::FileHash;
use strict;
use warnings;

=head1 DESCRIPTION

Object that indexes file entries by their content hash and determines which
of them can safely be hard-linked together.
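
A typical round trip, mirroring how the script above uses this class:

    my $filehash = Directory::Simplify::FileHash->new;
    my $entry    = $filehash->make_entry($filename);
    my @targets  = $filehash->find_hardlinkable($entry);
    # hard-link $entry with @targets, if any, then:
    $filehash->add_entry($entry);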

=cut

sub new {
    my ($class, $self) = (shift, {});

    $self->{_files} = {};

    require Digest::SHA;
    $self->{_ctx} = Digest::SHA->new;

    # default options
    $self->{_min_linkable_size} = 1;

    bless $self, $class
}

=head2 min_linkable_size($bytes)

Set or get the minimum size, in bytes, of files considered hard-linkable.
The default is 1.

=cut

sub min_linkable_size {
    my $self = shift;
    my $in = shift;
    if (defined $in) {
        $self->{_min_linkable_size} = $in;
    }
    $self->{_min_linkable_size}
}

=head2 make_entry($filename)

Gather C<lstat> information and a content hash for I<$filename> and return
them as a hash reference.

=cut

sub make_entry {
    my $self = shift;
    my ($filename) = @_;

    # organize results from lstat into hash
    my $entry = {};
    (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
        atime mtime ctime blksize blocks /})
        = ($filename, lstat $filename);

    $entry->{hash} = $self->_hash($filename);

    $entry
}

=head2 add_entry($entry)

Index I<$entry> (as returned by C<make_entry>) under its content hash.

=cut

sub add_entry {
    my $self = shift;
    my ($entry) = @_;

    my $hash = $entry->{hash};

    unless (defined $self->{_files}->{$hash}) {
        $self->{_files}->{$hash} = [];
    }
    push @{$self->{_files}->{$hash}}, $entry;
}
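
=head2 find_hardlinkable($entry)

Return the list of previously added entries that may be hard-linked with
I<$entry> (same size and contents, and not already the same inode).

=cut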

sub find_hardlinkable {
    my $self = shift;
    my ($entry) = @_;

    my $hash = $entry->{hash};

    # no matching entries
    unless (defined $self->{_files}->{$hash}) {
        return ();
    }

    my @matches;
    foreach my $ent (@{$self->{_files}->{$hash}}) {
        if ($self->_entries_are_hard_linkable($entry, $ent)) {
            push @matches, $ent;
        }
    }

    return @matches;
}

=head2 less_linked($entry_a, $entry_b, ...)

Sort the given entries by link count, ascending, so the first result is the
file less embedded in the file system.

=cut

sub less_linked {
    my $self = shift;
    return sort
        {$a->{nlink} <=> $b->{nlink}}
        @_;
}

=head2 oldest_mtime($entry_a, $entry_b, ...)

Sort the given entries by modification time, ascending, so the first result
is the entry with the oldest time stamp.

=cut

sub oldest_mtime {
    my $self = shift;

    return sort
        {$a->{mtime} <=> $b->{mtime}}
        @_;
}

sub _hash {
    my $self = shift;
    my ($filename) = @_;
    # Digest::SHA's hexdigest resets the object's state, so the context
    # is safely re-used between calls
    $self->{_ctx}->addfile($filename);
    return $self->{_ctx}->hexdigest;
}

sub _entries_are_hard_linkable {
    my $self = shift;
    my ($entry_a, $entry_b) = @_;

    # obviously, if the sizes aren't the same, they're not the same file
    unless (&_entries_sizes_match($entry_a, $entry_b)) {
        return 0;
    }

    # too small to be hard-linked
    if ($entry_a->{size} < $self->min_linkable_size) {
        return 0;
    }

    # they're the same file, don't try it
    if (&_entries_are_same_filename($entry_a, $entry_b)) {
        return 0;
    }
    if (&_entries_are_already_hard_linked($entry_a, $entry_b)) {
        return 0;
    }

    if (&_entries_contents_match($entry_a, $entry_b)) {
        return 1;
    }
    return 0;
}

sub _entries_are_same_filename {
    my ($entry_a, $entry_b) = @_;

    if ($entry_a->{name} eq $entry_b->{name}) {
        return 1;
    }
    return 0;
}

sub _entries_are_already_hard_linked {
    my ($entry_a, $entry_b) = @_;

    if ($entry_a->{ino} == $entry_b->{ino} and
        $entry_a->{dev} == $entry_b->{dev}) {
        return 1;
    }

    return 0;
}

sub _entries_sizes_match {
    my ($entry_a, $entry_b) = @_;
    if ($entry_a->{size} != $entry_b->{size}) {
        return 0;
    }
    return 1;
}

sub _entries_contents_match {
    my ($entry_a, $entry_b) = @_;

    # also, if the hashes aren't the same, they cannot be the same file
    if ($entry_a->{hash} ne $entry_b->{hash}) {
        return 0;
    }

    use File::Compare qw/ compare /;
    my $contents_same = (0 == &compare($entry_a->{name}, $entry_b->{name}));

    # warn about hash collision
    unless ($contents_same) {
        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
    }
    return $contents_same;
}