Clean up script
This commit is contained in:
parent
801bed9fa9
commit
a10996e70f
|
@ -3,7 +3,7 @@
|
||||||
use strict;
|
use strict;
|
||||||
use warnings;
|
use warnings;
|
||||||
|
|
||||||
our $VERSION = '1.2.2';
|
our $VERSION = '1.2.3';
|
||||||
|
|
||||||
=pod
|
=pod
|
||||||
|
|
||||||
|
@ -115,7 +115,7 @@ Fixed bug when processing files with \r characters in the name.
|
||||||
|
|
||||||
=head1 COPYRIGHT
|
=head1 COPYRIGHT
|
||||||
|
|
||||||
Copyright (C) 2010-2013 Dan Church.
|
Copyright (C) 2010-2018 Dan Church.
|
||||||
|
|
||||||
License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).
|
License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).
|
||||||
|
|
||||||
|
@ -132,7 +132,6 @@ Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>
|
||||||
use File::Find qw/ find /;
|
use File::Find qw/ find /;
|
||||||
require Digest::SHA;
|
require Digest::SHA;
|
||||||
use Getopt::Std qw/ getopts /;
|
use Getopt::Std qw/ getopts /;
|
||||||
require Pod::Text;
|
|
||||||
|
|
||||||
sub HELP_MESSAGE {
|
sub HELP_MESSAGE {
|
||||||
# my $fh = shift;
|
# my $fh = shift;
|
||||||
|
@ -152,6 +151,7 @@ sub HELP_MESSAGE {
|
||||||
#symlinked by default.
|
#symlinked by default.
|
||||||
#EOF
|
#EOF
|
||||||
#;
|
#;
|
||||||
|
require Pod::Text;
|
||||||
my ($fh, $pod) = (shift, Pod::Text->new);
|
my ($fh, $pod) = (shift, Pod::Text->new);
|
||||||
$pod->parse_from_file($0, $fh);
|
$pod->parse_from_file($0, $fh);
|
||||||
|
|
||||||
|
@ -202,167 +202,156 @@ sub findexec {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
# make sure the file exists and it's not a link
|
# skip non-existent files and links
|
||||||
if (-f $File::Find::name && ! -l $File::Find::name) {
|
unless (-f $File::Find::name && ! -l $File::Find::name) {
|
||||||
#my $ctx = Digest::MD5->new;
|
return;
|
||||||
my $ctx = Digest::SHA->new;
|
}
|
||||||
$ctx->addfile($File::Find::name);
|
|
||||||
|
|
||||||
# save the hex digest because reading the value from
|
#my $ctx = Digest::MD5->new;
|
||||||
# Digest::* destroys it
|
my $ctx = Digest::SHA->new;
|
||||||
my $digest = $ctx->hexdigest;
|
$ctx->addfile($File::Find::name);
|
||||||
|
|
||||||
# organize results from lstat into hash
|
# save the hex digest because reading the value from
|
||||||
my $entry = {};
|
# Digest::* destroys it
|
||||||
(@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
|
my $digest = $ctx->hexdigest;
|
||||||
atime mtime ctime blksize blocks /})
|
|
||||||
= ($File::Find::name, lstat $File::Find::name);
|
|
||||||
|
|
||||||
# skip zero-length files if wanted (`-z')
|
# organize results from lstat into hash
|
||||||
# truth table:
|
my $entry = {};
|
||||||
# -z | non-zero length | return?
|
(@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
|
||||||
# 0 | 0 | 1
|
atime mtime ctime blksize blocks /})
|
||||||
# 0 | 1 | 0
|
= ($File::Find::name, lstat $File::Find::name);
|
||||||
# 1 | 0 | 0
|
|
||||||
# 1 | 1 | 0
|
|
||||||
return unless $opts{z} or $entry->{size};
|
|
||||||
|
|
||||||
# check to see if we've come across a file with the same crc
|
# skip zero-length files if wanted (`-z')
|
||||||
if (exists $files{$digest}) {
|
return unless $opts{z} or $entry->{size};
|
||||||
my $curr_entry = $files{$digest};
|
|
||||||
|
|
||||||
# don't waste my time
|
# check to see if we've come across a file with the same checksum
|
||||||
return if $curr_entry->{name} eq $entry->{name} or
|
unless (exists $files{$digest}) {
|
||||||
$curr_entry->{ino} == $entry->{ino};
|
# the file is unique (as far as we know)
|
||||||
|
# create a new entry in the hash table
|
||||||
|
$files{$digest} = $entry;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
# identical files should be the same size (I'm paranoid
|
my $curr_entry = $files{$digest};
|
||||||
# of checksumming procedures); if it's not, there's a
|
|
||||||
# problem with the checksumming procedure or this
|
# don't waste my time
|
||||||
# script is processing WAY too many files
|
return if $curr_entry->{name} eq $entry->{name} or
|
||||||
# (addendum: perhaps I should congratulate the user on
|
$curr_entry->{ino} == $entry->{ino};
|
||||||
# finding a collision in SHA-1)
|
|
||||||
if ($curr_entry->{size} != $entry->{size}) {
|
# identical files should be the same size (I'm paranoid
|
||||||
die "ERROR: checksums identical for two non-identical files!!!:\n".
|
# of checksumming procedures); if it's not, there's a
|
||||||
"files:\t`$curr_entry->{name}'\n".
|
# problem with the checksumming procedure or this
|
||||||
"\t`$entry->{name}'\n".
|
# script is processing WAY too many files
|
||||||
"SHA1: ($digest)\n".
|
# (addendum: perhaps I should congratulate the user on
|
||||||
'(this is probably a limit of SHA1; try processing fewer files)';
|
# finding a collision in SHA-1)
|
||||||
|
if ($curr_entry->{size} != $entry->{size}) {
|
||||||
|
die "ERROR: checksums identical for two non-identical files!!!:\n".
|
||||||
|
"files:\t`$curr_entry->{name}'\n".
|
||||||
|
"\t`$entry->{name}'\n".
|
||||||
|
"SHA1: ($digest)\n".
|
||||||
|
'(this is probably a limit of SHA1; try processing fewer files)';
|
||||||
|
}
|
||||||
|
|
||||||
|
# find the oldest time stamp
|
||||||
|
my ($atime, $mtime) = @{(sort
|
||||||
|
{$a->{mtime} <=> $b->{mtime}}
|
||||||
|
($entry, $curr_entry)
|
||||||
|
)[0]}{qw/ atime mtime /};
|
||||||
|
|
||||||
|
# find the file less embedded in the file system
|
||||||
|
my ($less_linked, $more_linked) = sort
|
||||||
|
{$a->{nlink} <=> $b->{nlink}}
|
||||||
|
($entry, $curr_entry);
|
||||||
|
|
||||||
|
# hard-linkable files must exist on the same device and
|
||||||
|
# must not already be hard-linked
|
||||||
|
if ($curr_entry->{dev} == $entry->{dev} &&
|
||||||
|
! $opts{s}) {
|
||||||
|
# attempt to unlink the file
|
||||||
|
printf STDERR "removing file `%s'\n",
|
||||||
|
$less_linked->{name} if $opts{v};
|
||||||
|
unless (unlink $less_linked->{name}) {
|
||||||
|
|
||||||
|
# couldn't do it; try more-linked file
|
||||||
|
|
||||||
|
printf STDERR "Failed to remove file `%s': %s\n(using `%s')\n",
|
||||||
|
$less_linked->{name},
|
||||||
|
$!,
|
||||||
|
$more_linked->{name}
|
||||||
|
if $opts{v};
|
||||||
|
|
||||||
|
# if we can't do this, there's no point
|
||||||
|
# in continuing
|
||||||
|
unless (unlink $more_linked->{name}) {
|
||||||
|
printf STDERR "Failed to remove file `%s' (second failure on match): %s\nSkipping...\n",
|
||||||
|
$more_linked->{name},
|
||||||
|
$!
|
||||||
|
if $opts{v};
|
||||||
|
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
# find the oldest time stamp
|
# the ol' switcheroo
|
||||||
my ($atime, $mtime) = @{(sort
|
($more_linked, $less_linked) =
|
||||||
{$a->{mtime} <=> $b->{mtime}}
|
($less_linked, $more_linked);
|
||||||
($entry, $curr_entry)
|
|
||||||
)[0]}{qw/ atime mtime /};
|
|
||||||
|
|
||||||
# find the file less embedded in the file system
|
|
||||||
my ($less_linked, $more_linked) = sort
|
|
||||||
{$a->{nlink} <=> $b->{nlink}}
|
|
||||||
($entry, $curr_entry);
|
|
||||||
|
|
||||||
# hard-linkable files must exist on the same device and
|
|
||||||
# must not already be hard-linked
|
|
||||||
if ($curr_entry->{dev} == $entry->{dev} &&
|
|
||||||
! $opts{s}) {
|
|
||||||
# print "hard-linking $new_file\t=>$old_file\n";
|
|
||||||
# attempt to unlink the file
|
|
||||||
printf STDERR "removing file `%s'\n",
|
|
||||||
$less_linked->{name} if $opts{v};
|
|
||||||
unless (unlink $less_linked->{name}) {
|
|
||||||
|
|
||||||
# couldn't do it; try more-linked file
|
|
||||||
|
|
||||||
printf STDERR <<EOF
|
|
||||||
Failed to remove file `%s': %s
|
|
||||||
(using `%s')
|
|
||||||
EOF
|
|
||||||
,
|
|
||||||
$less_linked->{name},
|
|
||||||
$!,
|
|
||||||
$more_linked->{name}
|
|
||||||
if $opts{v};
|
|
||||||
|
|
||||||
# if we can't do this, there's no point
|
|
||||||
# in continuing
|
|
||||||
unless (unlink $more_linked->{name}) {
|
|
||||||
printf STDERR <<EOF
|
|
||||||
Failed to remove file `%s' (second failure on match): %s
|
|
||||||
Skipping...
|
|
||||||
EOF
|
|
||||||
,
|
|
||||||
$more_linked->{name},
|
|
||||||
$!
|
|
||||||
if $opts{v};
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
# the ol' switcheroo
|
|
||||||
($more_linked, $less_linked) =
|
|
||||||
($less_linked, $more_linked);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
# we unlinked it or failed out
|
|
||||||
$freed_bytes += $less_linked->{size}
|
|
||||||
unless $less_linked->{nlink} > 1;
|
|
||||||
|
|
||||||
printf STDERR "hard linking `%s' => `%s'\n",
|
|
||||||
$less_linked->{name}, $more_linked->{name}
|
|
||||||
if $opts{v};
|
|
||||||
|
|
||||||
# hard link the files
|
|
||||||
link $more_linked->{name},
|
|
||||||
$less_linked->{name};
|
|
||||||
|
|
||||||
# update link count in our hash to reflect the
|
|
||||||
# file system (referenced)
|
|
||||||
++$more_linked->{nlink};
|
|
||||||
|
|
||||||
# preserve older time stamp
|
|
||||||
utime $atime, $mtime, $less_linked->{name};
|
|
||||||
} elsif (! $opts{S}) {
|
|
||||||
# files are on different drives;
|
|
||||||
# most that can be done is to soft-link them
|
|
||||||
|
|
||||||
unless (unlink $less_linked->{name}) {
|
|
||||||
|
|
||||||
# couldn't do it; try more-linked file
|
|
||||||
|
|
||||||
printf STDERR "couldn't remove file `%s' (using `%s')\n",
|
|
||||||
$less_linked->{name},
|
|
||||||
$more_linked->{name} if $opts{v};
|
|
||||||
|
|
||||||
# if we can't do this, there's no point
|
|
||||||
# in continuing
|
|
||||||
unlink $more_linked->{name}
|
|
||||||
or return;
|
|
||||||
|
|
||||||
# the ol' switcheroo
|
|
||||||
($more_linked, $less_linked) =
|
|
||||||
($less_linked, $more_linked);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
# we unlinked it or failed out
|
|
||||||
$freed_bytes += $less_linked->{size};
|
|
||||||
|
|
||||||
printf STDERR "soft-linking %s => %s\n",
|
|
||||||
$less_linked->{name}, $more_linked->{name}
|
|
||||||
if $opts{v};
|
|
||||||
|
|
||||||
# create a soft link (TODO: relative links)
|
|
||||||
symlink $more_linked->{name},
|
|
||||||
$less_linked->{name};
|
|
||||||
|
|
||||||
# preserve older time stamp
|
|
||||||
utime $atime, $mtime, $less_linked->{name};
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
# the file is unique (as far as we know)
|
|
||||||
# create a new entry in the hash table
|
|
||||||
$files{$digest} = $entry;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# we unlinked it or failed out
|
||||||
|
$freed_bytes += $less_linked->{size}
|
||||||
|
unless $less_linked->{nlink} > 1;
|
||||||
|
|
||||||
|
printf STDERR "hard linking `%s' => `%s'\n",
|
||||||
|
$less_linked->{name}, $more_linked->{name}
|
||||||
|
if $opts{v};
|
||||||
|
|
||||||
|
# hard link the files
|
||||||
|
link $more_linked->{name},
|
||||||
|
$less_linked->{name};
|
||||||
|
|
||||||
|
# update link count in our hash to reflect the
|
||||||
|
# file system (referenced)
|
||||||
|
++$more_linked->{nlink};
|
||||||
|
|
||||||
|
# preserve older time stamp
|
||||||
|
utime $atime, $mtime, $less_linked->{name};
|
||||||
|
} elsif (! $opts{S}) {
|
||||||
|
# files are on different drives;
|
||||||
|
# most that can be done is to soft-link them
|
||||||
|
|
||||||
|
unless (unlink $less_linked->{name}) {
|
||||||
|
|
||||||
|
# couldn't do it; try more-linked file
|
||||||
|
|
||||||
|
printf STDERR "couldn't remove file `%s' (using `%s')\n",
|
||||||
|
$less_linked->{name},
|
||||||
|
$more_linked->{name} if $opts{v};
|
||||||
|
|
||||||
|
# if we can't do this, there's no point
|
||||||
|
# in continuing
|
||||||
|
unlink $more_linked->{name}
|
||||||
|
or return;
|
||||||
|
|
||||||
|
# the ol' switcheroo
|
||||||
|
($more_linked, $less_linked) =
|
||||||
|
($less_linked, $more_linked);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
# we unlinked it or failed out
|
||||||
|
$freed_bytes += $less_linked->{size};
|
||||||
|
|
||||||
|
printf STDERR "soft-linking %s => %s\n",
|
||||||
|
$less_linked->{name}, $more_linked->{name}
|
||||||
|
if $opts{v};
|
||||||
|
|
||||||
|
# create a soft link (TODO: relative links)
|
||||||
|
symlink $more_linked->{name},
|
||||||
|
$less_linked->{name};
|
||||||
|
|
||||||
|
# preserve older time stamp
|
||||||
|
utime $atime, $mtime, $less_linked->{name};
|
||||||
|
}
|
||||||
#} elsif (-l $File::Find::name) {
|
#} elsif (-l $File::Find::name) {
|
||||||
# # do something to simplify symlinks
|
# # do something to simplify symlinks
|
||||||
# printf STDERR "FIXME: simplifying symlink `%s'\n",
|
# printf STDERR "FIXME: simplifying symlink `%s'\n",
|
||||||
|
@ -371,7 +360,6 @@ EOF
|
||||||
|
|
||||||
# printf STDERR "symlink `%s' points to `%s'\n",
|
# printf STDERR "symlink `%s' points to `%s'\n",
|
||||||
# $File::Find::name, readlink $File::Find::name;
|
# $File::Find::name, readlink $File::Find::name;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
printf STDERR "freed %d bytes (%0.4G %s)\n",
|
printf STDERR "freed %d bytes (%0.4G %s)\n",
|
||||||
|
|
Loading…
Reference in New Issue