package Directory::Simplify::Instruction::Generator;
# vi: et sts=4 sw=4 ts=4
#
# Generates the instruction list (hardlink + timestamp-copy operations) that
# collapses duplicate files found by a Directory::Simplify filehash into
# hardlinked groups.

use strict;
use warnings;
use overload '""' => 'as_string';
use File::Basename qw/ dirname /;
use File::Compare qw/ compare /;
# :squash-ignore-start:
require Directory::Simplify::Instruction::CopyTimestamp;
require Directory::Simplify::Instruction::Hardlink;
# :squash-ignore-end:

# Constructor. Accepted options (as key => value pairs):
#   filehash => object whose entries() returns lists of same-hash file entries
#   min_size => smallest file size (bytes) eligible for linking (default 1)
sub new {
    my $class = shift;
    return bless {
        filehash => undef,
        min_size => 1,
        @_,
    }, $class;
}

# Stringification hook (see `use overload` above): the newline-joined
# instruction list.
sub as_string {
    my $self = shift;
    join "\n", $self->instructions;
}

# Partition the filehash's candidate lists into buckets of files that are
# actually hard-linkable with one another (same size and contents, not
# already the same inode). Returns a list of arrayrefs of entries.
sub buckets {
    my $self = shift;
    my @candidate_lists = $self->{filehash}->entries;
    my @buckets;

    foreach my $candidates (@candidate_lists) {
        my @ca = @{$candidates}; # make a clone
        my @these_buckets;

        # at least two files needed to link together
        if (@ca > 1) {
            ELIMINATOR: while (@ca) {
                my $entry = shift @ca;

                next ELIMINATOR if $self->_entry_should_be_skipped($entry);

                # try to place the entry into an existing bucket; comparing
                # against the bucket's first member suffices because bucket
                # membership is transitive (same size + same contents)
                foreach my $bucket_idx (0 .. $#these_buckets) {
                    if (_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
                        push @{$these_buckets[$bucket_idx]}, $entry;
                        next ELIMINATOR;
                    }
                }

                # didn't find a bucket (even though the hash matched!)
                # -> new bucket
                push @these_buckets, [$entry];
            }
        }
        push @buckets, @these_buckets;
    }

    @buckets
}

# True if the entry should not participate in linking at all.
sub _entry_should_be_skipped {
    my ($self, $entry_a) = @_;

    # too small to be hard-linked
    if ($entry_a->{size} < $self->{min_size}) {
        return 1;
    }

    return 0;
}

# True if the two entries can (and should) be hard-linked together:
# equal sizes, not already the same inode, and identical contents.
sub _entries_are_hard_linkable {
    my ($entry_a, $entry_b) = @_;

    # obviously, if the sizes aren't the same, they're not the same file
    unless (_entries_sizes_match($entry_a, $entry_b)) {
        return 0;
    }

    # they're the same file, don't try it
    if (_entries_are_already_hard_linked($entry_a, $entry_b)) {
        return 0;
    }

    if (_entries_contents_match($entry_a, $entry_b)) {
        return 1;
    }

    return 0;
}

# Sort entries by ascending mtime; the first returned entry is the oldest.
sub oldest_mtime {
    my $self = shift;
    return sort {
        $a->{mtime} <=> $b->{mtime}
    } @_;
}

# Sort entries so the best candidate for the kept link source comes first:
# entries whose directory is unwritable (so the file itself cannot be
# unlinked and must therefore be the kept source) sort to the front, then
# by descending link count. Warns once per unwritable-directory file.
sub more_linked {
    my $self = shift;
    my %warned;
    return sort {
        if (! -w dirname($a->{name})) {
            warn "Warning: $a->{name} not able to be unlinked!" unless $warned{$a->{name}}++;
            # FIX: was `return 1`, which sorted $a to the BACK despite the
            # stated intent; a comparator must return -1 to put $a first
            return -1; # favor a -> front
        } elsif (! -w dirname($b->{name})) {
            warn "Warning: $b->{name} not able to be unlinked!" unless $warned{$b->{name}}++;
            # FIX: was `return -1`; returning 1 puts $b first
            return 1; # favor b -> front
        }
        $b->{nlink} <=> $a->{nlink}
    } @_;
}

# True if both entries already refer to the same inode on the same device.
sub _entries_are_already_hard_linked {
    my ($entry_a, $entry_b) = @_;

    if ($entry_a->{ino} == $entry_b->{ino} and
        $entry_a->{dev} == $entry_b->{dev}) {
        return 1;
    }

    return 0;
}

# True if the entries report identical file sizes.
sub _entries_sizes_match {
    my ($entry_a, $entry_b) = @_;

    if ($entry_a->{size} != $entry_b->{size}) {
        return 0;
    }

    return 1;
}

# True if the files' contents are byte-identical (File::Compare::compare
# returns 0 on equality). Since callers only reach this after a hash match,
# a mismatch here means a hash collision, which is worth warning about.
sub _entries_contents_match {
    my ($entry_a, $entry_b) = @_;

    my $contents_same = (0 == compare($entry_a->{name}, $entry_b->{name}));

    # warn about hash collision
    unless ($contents_same) {
        warn "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
    }

    return $contents_same;
}

# generate hardlink instructions
#
# For each bucket: keep the most-linked entry as the link source, hardlink
# every other entry to it, and propagate the bucket's oldest mtime to every
# rewritten file (including the kept source, if it isn't the oldest itself).
sub instructions {
    require Scalar::Util;
    my $self = shift;

    # start generating instructions
    my @inst;

    foreach my $bucket ($self->buckets) {
        # of the bucket, find the oldest timestamp
        my ($oldest_entry) = $self->oldest_mtime(@{$bucket});

        # of the bucket, find the file most embedded in the file system
        my @to_link = $self->more_linked(@{$bucket});
        my $most_linked_entry = shift @to_link;

        foreach my $entry (@to_link) {
            # XXX there shouldn't be a need to update entries' link counts,
            # since this generates all the instructions at once
            push @inst, Directory::Simplify::Instruction::Hardlink->new(
                source => $most_linked_entry,
                target => $entry,
            );
            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
                source => $oldest_entry,
                target => $entry,
            );
        }

        if (Scalar::Util::refaddr($most_linked_entry) != Scalar::Util::refaddr($oldest_entry)) {
            # most_linked_entry should get its timestamp updated
            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
                source => $oldest_entry,
                target => $most_linked_entry,
            );
        }
    }

    @inst
}

1;