Clean up script

This commit is contained in:
Dan Church 2018-01-21 12:30:35 -06:00
parent 801bed9fa9
commit a10996e70f
Signed by: h3xx
GPG Key ID: EA2BF379CD2CDBD0
1 changed files with 143 additions and 155 deletions

View File

@ -3,7 +3,7 @@
use strict; use strict;
use warnings; use warnings;
our $VERSION = '1.2.2'; our $VERSION = '1.2.3';
=pod =pod
@ -115,7 +115,7 @@ Fixed bug when processing files with \r characters in the name.
=head1 COPYRIGHT =head1 COPYRIGHT
Copyright (C) 2010-2013 Dan Church. Copyright (C) 2010-2018 Dan Church.
License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>). License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).
@ -132,7 +132,6 @@ Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
use File::Find qw/ find /; use File::Find qw/ find /;
require Digest::SHA; require Digest::SHA;
use Getopt::Std qw/ getopts /; use Getopt::Std qw/ getopts /;
require Pod::Text;
sub HELP_MESSAGE { sub HELP_MESSAGE {
# my $fh = shift; # my $fh = shift;
@ -152,6 +151,7 @@ sub HELP_MESSAGE {
#symlinked by default. #symlinked by default.
#EOF #EOF
#; #;
require Pod::Text;
my ($fh, $pod) = (shift, Pod::Text->new); my ($fh, $pod) = (shift, Pod::Text->new);
$pod->parse_from_file($0, $fh); $pod->parse_from_file($0, $fh);
@ -202,167 +202,156 @@ sub findexec {
return; return;
} }
# make sure the file exists and it's not a link # skip non-existent files and links
if (-f $File::Find::name && ! -l $File::Find::name) { unless (-f $File::Find::name && ! -l $File::Find::name) {
#my $ctx = Digest::MD5->new; return;
my $ctx = Digest::SHA->new; }
$ctx->addfile($File::Find::name);
# save the hex digest because reading the value from #my $ctx = Digest::MD5->new;
# Digest::* destroys it my $ctx = Digest::SHA->new;
my $digest = $ctx->hexdigest; $ctx->addfile($File::Find::name);
# organize results from lstat into hash # save the hex digest because reading the value from
my $entry = {}; # Digest::* destroys it
(@{$entry}{qw/ name dev ino mode nlink uid gid rdev size my $digest = $ctx->hexdigest;
atime mtime ctime blksize blocks /})
= ($File::Find::name, lstat $File::Find::name);
# skip zero-length files if wanted (`-z') # organize results from lstat into hash
# truth table: my $entry = {};
# -z | non-zero length | return? (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
# 0 | 0 | 1 atime mtime ctime blksize blocks /})
# 0 | 1 | 0 = ($File::Find::name, lstat $File::Find::name);
# 1 | 0 | 0
# 1 | 1 | 0
return unless $opts{z} or $entry->{size};
# check to see if we've come across a file with the same crc # skip zero-length files if wanted (`-z')
if (exists $files{$digest}) { return unless $opts{z} or $entry->{size};
my $curr_entry = $files{$digest};
# don't waste my time # check to see if we've come across a file with the same checksum
return if $curr_entry->{name} eq $entry->{name} or unless (exists $files{$digest}) {
$curr_entry->{ino} == $entry->{ino}; # the file is unique (as far as we know)
# create a new entry in the hash table
$files{$digest} = $entry;
return;
}
# identical files should be the same size (I'm paranoid my $curr_entry = $files{$digest};
# of checksumming procedures); if it's not, there's a
# problem with the checksumming procedure or this # don't waste my time
# script is processing WAY too many files return if $curr_entry->{name} eq $entry->{name} or
# (addendum: perhaps I should congratulate the user on $curr_entry->{ino} == $entry->{ino};
# finding a collision in SHA-1)
if ($curr_entry->{size} != $entry->{size}) { # identical files should be the same size (I'm paranoid
die "ERROR: checksums identical for two non-identical files!!!:\n". # of checksumming procedures); if it's not, there's a
"files:\t`$curr_entry->{name}'\n". # problem with the checksumming procedure or this
"\t`$entry->{name}'\n". # script is processing WAY too many files
"SHA1: ($digest)\n". # (addendum: perhaps I should congratulate the user on
'(this is probably a limit of SHA1; try processing fewer files)'; # finding a collision in SHA-1)
if ($curr_entry->{size} != $entry->{size}) {
die "ERROR: checksums identical for two non-identical files!!!:\n".
"files:\t`$curr_entry->{name}'\n".
"\t`$entry->{name}'\n".
"SHA1: ($digest)\n".
'(this is probably a limit of SHA1; try processing fewer files)';
}
# find the oldest time stamp
my ($atime, $mtime) = @{(sort
{$a->{mtime} <=> $b->{mtime}}
($entry, $curr_entry)
)[0]}{qw/ atime mtime /};
# find the file less embedded in the file system
my ($less_linked, $more_linked) = sort
{$a->{nlink} <=> $b->{nlink}}
($entry, $curr_entry);
# hard-linkable files must exist on the same device and
# must not already be hard-linked
if ($curr_entry->{dev} == $entry->{dev} &&
! $opts{s}) {
# attempt to unlink the file
printf STDERR "removing file `%s'\n",
$less_linked->{name} if $opts{v};
unless (unlink $less_linked->{name}) {
# couldn't do it; try more-linked file
printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n",
$less_linked->{name},
$!,
$more_linked->{name}
if $opts{v};
# if we can't do this, there's no point
# in continuing
unless (unlink $more_linked->{name}) {
printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
$more_linked->{name},
$!
if $opts{v};
return;
} }
# find the oldest time stamp # the ol' switcheroo
my ($atime, $mtime) = @{(sort ($more_linked, $less_linked) =
{$a->{mtime} <=> $b->{mtime}} ($less_linked, $more_linked);
($entry, $curr_entry)
)[0]}{qw/ atime mtime /};
# find the file less embedded in the file system
my ($less_linked, $more_linked) = sort
{$a->{nlink} <=> $b->{nlink}}
($entry, $curr_entry);
# hard-linkable files must exist on the same device and
# must not already be hard-linked
if ($curr_entry->{dev} == $entry->{dev} &&
! $opts{s}) {
# print "hard-linking $new_file\t=>$old_file\n";
# attempt to unlink the file
printf STDERR "removing file `%s'\n",
$less_linked->{name} if $opts{v};
unless (unlink $less_linked->{name}) {
# couldn't do it; try more-linked file
printf STDERR <<EOF
Failed to remove file `%s': %s
(using `%s')
EOF
,
$less_linked->{name},
$!,
$more_linked->{name}
if $opts{v};
# if we can't do this, there's no point
# in continuing
unless (unlink $more_linked->{name}) {
printf STDERR <<EOF
Failed to remove file `%s' (second failure on match): %s
Skipping...
EOF
,
$more_linked->{name},
$!
if $opts{v};
return;
}
# the ol' switcheroo
($more_linked, $less_linked) =
($less_linked, $more_linked);
}
# we unlinked it or failed out
$freed_bytes += $less_linked->{size}
unless $less_linked->{nlink} > 1;
printf STDERR "hard linking `%s' => `%s'\n",
$less_linked->{name}, $more_linked->{name}
if $opts{v};
# hard link the files
link $more_linked->{name},
$less_linked->{name};
# update link count in our hash to reflect the
# file system (referenced)
++$more_linked->{nlink};
# preserve older time stamp
utime $atime, $mtime, $less_linked->{name};
} elsif (! $opts{S}) {
# files are on different drives;
# most that can be done is to soft-link them
unless (unlink $less_linked->{name}) {
# couldn't do it; try more-linked file
printf STDERR "couldn't remove file `%s' (using `%s')\n",
$less_linked->{name},
$more_linked->{name} if $opts{v};
# if we can't do this, there's no point
# in continuing
unlink $more_linked->{name}
or return;
# the ol' switcheroo
($more_linked, $less_linked) =
($less_linked, $more_linked);
}
# we unlinked it or failed out
$freed_bytes += $less_linked->{size};
printf STDERR "soft-linking %s => %s\n",
$less_linked->{name}, $more_linked->{name}
if $opts{v};
# create a soft link (TODO: relative links)
symlink $more_linked->{name},
$less_linked->{name};
# preserve older time stamp
utime $atime, $mtime, $less_linked->{name};
}
} else {
# the file is unique (as far as we know)
# create a new entry in the hash table
$files{$digest} = $entry;
} }
# we unlinked it or failed out
$freed_bytes += $less_linked->{size}
unless $less_linked->{nlink} > 1;
printf STDERR "hard linking `%s' => `%s'\n",
$less_linked->{name}, $more_linked->{name}
if $opts{v};
# hard link the files
link $more_linked->{name},
$less_linked->{name};
# update link count in our hash to reflect the
# file system (referenced)
++$more_linked->{nlink};
# preserve older time stamp
utime $atime, $mtime, $less_linked->{name};
} elsif (! $opts{S}) {
# files are on different drives;
# most that can be done is to soft-link them
unless (unlink $less_linked->{name}) {
# couldn't do it; try more-linked file
printf STDERR "couldn't remove file `%s' (using `%s')\n",
$less_linked->{name},
$more_linked->{name} if $opts{v};
# if we can't do this, there's no point
# in continuing
unlink $more_linked->{name}
or return;
# the ol' switcheroo
($more_linked, $less_linked) =
($less_linked, $more_linked);
}
# we unlinked it or failed out
$freed_bytes += $less_linked->{size};
printf STDERR "soft-linking %s => %s\n",
$less_linked->{name}, $more_linked->{name}
if $opts{v};
# create a soft link (TODO: relative links)
symlink $more_linked->{name},
$less_linked->{name};
# preserve older time stamp
utime $atime, $mtime, $less_linked->{name};
}
#} elsif (-l $File::Find::name) { #} elsif (-l $File::Find::name) {
# # do something to simplify symlinks # # do something to simplify symlinks
# printf STDERR "FIXME: simplifying symlink `%s'\n", # printf STDERR "FIXME: simplifying symlink `%s'\n",
@ -371,7 +360,6 @@ EOF
# printf STDERR "symlink `%s' points to `%s'\n", # printf STDERR "symlink `%s' points to `%s'\n",
# $File::Find::name, readlink $File::Find::name; # $File::Find::name, readlink $File::Find::name;
}
} }
printf STDERR "freed %d bytes (%0.4G %s)\n", printf STDERR "freed %d bytes (%0.4G %s)\n",