package Directory::Simplify::Instruction::Generator; use strict; use warnings; use overload '""' => 'as_string'; use Carp qw/ carp /; require File::Compare; # :squash-remove-start: require Directory::Simplify::Instruction::CopyTimestamp; require Directory::Simplify::Instruction::Hardlink; # :squash-remove-end: sub new { my ($class, %args) = @_; return bless { filehash => undef, min_size => 1, %args, }, $class; } sub as_string { my $self = shift; return join "\n", $self->instructions; } sub buckets { my $self = shift; my @candidate_lists = $self->{filehash}->entries; my @buckets; foreach my $candidates (@candidate_lists) { my @ca = @{$candidates}; # make a clone my @these_buckets; # at least two files needed to link together if (@ca > 1) { ELIMINATOR: while (@ca) { my $entry = shift @ca; next ELIMINATOR if $self->_entry_should_be_skipped($entry); foreach my $bucket_idx (0 .. $#these_buckets) { if (_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) { push @{$these_buckets[$bucket_idx]}, $entry; next ELIMINATOR; } } # didn't find a bucket (even though the hash matched!) # -> new bucket push @these_buckets, [$entry]; } } push @buckets, @these_buckets; } return @buckets; } sub _entry_should_be_skipped { my ($self, $entry_a) = @_; # too small to be hard-linked if ($entry_a->{size} < $self->{min_size}) { return 1; } return 0; } sub _entries_are_hard_linkable { my ($entry_a, $entry_b) = @_; # obviously, if the sizes aren't the same, they're not the same file unless (_entries_sizes_match($entry_a, $entry_b)) { return 0; } # they're the same file, don't try it if (_entries_are_already_hard_linked($entry_a, $entry_b)) { return 0; } if (_entries_contents_match($entry_a, $entry_b)) { return 1; } return 0; } sub _oldest_mtime { my @entries = @_; my @sorted = sort { $a->{mtime} <=> $b->{mtime} } @entries; return @sorted; } sub _more_linked { my @entries = @_; my @sorted = sort { $b->{nlink} <=> $a->{nlink} } @entries; return @sorted; } sub _entries_are_already_hard_linked { my ($entry_a, $entry_b) = @_; if ($entry_a->{ino} == $entry_b->{ino} and $entry_a->{dev} == $entry_b->{dev}) { return 1; } return 0; } sub _entries_sizes_match { my ($entry_a, $entry_b) = @_; if ($entry_a->{size} != $entry_b->{size}) { return 0; } return 1; } sub _entries_contents_match { my ($entry_a, $entry_b) = @_; my $contents_same = (0 == File::Compare::compare($entry_a->{name}, $entry_b->{name})); # warn about hash collision unless ($contents_same) { carp "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n"; } return $contents_same; } # generate hardlink instructions sub instructions { require Scalar::Util; my $self = shift; # start generating instructions my @inst; foreach my $bucket ($self->buckets) { # Of the bucket, find the oldest timestamp, regardless of read-only my ($oldest_entry) = _oldest_mtime(@{$bucket}); # Limit link/unlink operations to files in non-readonly directories my (@non_readonly, @readonly); foreach my $entry (@{$bucket}) { unless (-w $entry->{dirname}) { carp "Warning: $entry->{name} not able to be unlinked!"; push @readonly, $entry; } else { push @non_readonly, $entry; } } # Of the linkable files, find the file most embedded in the file system my @to_link = _more_linked(@non_readonly); @readonly = _more_linked(@readonly); # Select a basis for linkage, either the most-linked readonly entry (if # any) or the most linked of the read-write entries. my $most_linked_entry = shift @readonly // shift @to_link; foreach my $entry (@to_link) { # XXX there shouldn't be a need to update entries' link counts, # since this generates all the instructions at once push @inst, Directory::Simplify::Instruction::Hardlink->new( source => $most_linked_entry, target => $entry, ); push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( source => $oldest_entry, target => $entry, ); } if (Scalar::Util::refaddr($most_linked_entry) != Scalar::Util::refaddr($oldest_entry)) { # most_linked_entry should get its timestamp updated push @inst, Directory::Simplify::Instruction::CopyTimestamp->new( source => $oldest_entry, target => $most_linked_entry, ); } } return @inst; } 1;