package Directory::Simplify::Instruction::Generator;

use strict;
use warnings;
use overload '""' => 'as_string';
use Carp qw/ carp /;
require File::Compare;

# :squash-remove-start:
require Directory::Simplify::Instruction::CopyTimestamp;
require Directory::Simplify::Instruction::Hardlink;
# :squash-remove-end:
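# Turns the duplicate-file index (the "filehash" object, whose entries()
# method yields lists of files sharing a content hash) into a list of
# instruction objects: Hardlink instructions that re-point duplicates at a
# single inode, plus CopyTimestamp instructions that preserve each group's
# oldest mtime.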
sub new {
    my ($class, %args) = @_;
    return bless {
        filehash => undef,
        min_size => 1,
        %args,
    }, $class;
}
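# Constructor arguments (as used by the code below):
#   filehash - required; an object whose entries() method returns the
#              lists of same-hash candidate files (see buckets())
#   min_size - smallest file size that will be hard-linked; defaults to 1,
#              so zero-byte files are always skipped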
sub as_string {
    my $self = shift;
    return join "\n", $self->instructions;
}
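# Partition the filehash's candidate lists into "buckets": each bucket is
# an arrayref of entry hashrefs (with stat-derived fields such as size,
# mtime, nlink, ino and dev, plus name and dirname) that have been
# verified hard-linkable with one another.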
sub buckets {
    my $self = shift;

    my @candidate_lists = $self->{filehash}->entries;

    my @buckets;
    foreach my $candidates (@candidate_lists) {
        my @ca = @{$candidates}; # make a clone
        my @these_buckets;

        # at least two files needed to link together
        if (@ca > 1) {
            ELIMINATOR: while (@ca) {
                my $entry = shift @ca;

                next ELIMINATOR if $self->_entry_should_be_skipped($entry);

                foreach my $bucket_idx (0 .. $#these_buckets) {
                    if (_entries_are_hard_linkable($these_buckets[$bucket_idx]->[0], $entry)) {
                        push @{$these_buckets[$bucket_idx]}, $entry;
                        next ELIMINATOR;
                    }
                }

                # didn't find a bucket (even though the hash matched!)
                # -> new bucket
                push @these_buckets, [$entry];
            }
        }
        push @buckets, @these_buckets;
    }

    return @buckets;
}
sub _entry_should_be_skipped {
    my ($self, $entry_a) = @_;

    # too small to be hard-linked
    if ($entry_a->{size} < $self->{min_size}) {
        return 1;
    }
    return 0;
}
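# The linkability checks below run cheapest-first: size comparison (free,
# already in the entry), then device/inode identity (already linked), and
# only then a byte-for-byte content comparison, which must read both files.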
sub _entries_are_hard_linkable {
    my ($entry_a, $entry_b) = @_;

    # obviously, if the sizes aren't the same, they're not the same file
    unless (_entries_sizes_match($entry_a, $entry_b)) {
        return 0;
    }

    # they already share an inode; there's nothing to link
    if (_entries_are_already_hard_linked($entry_a, $entry_b)) {
        return 0;
    }

    if (_entries_contents_match($entry_a, $entry_b)) {
        return 1;
    }
    return 0;
}
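# Sort helpers: _oldest_mtime() orders entries by ascending mtime (oldest
# first), _more_linked() by descending nlink (most-linked first).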
sub _oldest_mtime {
    my @entries = @_;
    my @sorted = sort {
        $a->{mtime} <=> $b->{mtime}
    } @entries;
    return @sorted;
}
sub _more_linked {
    my @entries = @_;
    my @sorted = sort {
        $b->{nlink} <=> $a->{nlink}
    } @entries;
    return @sorted;
}
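# Two entries name the same file exactly when they agree on both device
# and inode; inode numbers alone may repeat across filesystems.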
sub _entries_are_already_hard_linked {
    my ($entry_a, $entry_b) = @_;

    if ($entry_a->{ino} == $entry_b->{ino} and
        $entry_a->{dev} == $entry_b->{dev}) {
        return 1;
    }

    return 0;
}
sub _entries_sizes_match {
    my ($entry_a, $entry_b) = @_;
    if ($entry_a->{size} != $entry_b->{size}) {
        return 0;
    }
    return 1;
}
sub _entries_contents_match {
    my ($entry_a, $entry_b) = @_;

    my $contents_same = (0 == File::Compare::compare($entry_a->{name}, $entry_b->{name}));

    # warn about hash collision
    unless ($contents_same) {
        carp "Hash collision between files:\n* $entry_a->{name}\n* $entry_b->{name}\n (don't panic)\n";
    }
    return $contents_same;
}
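# Per bucket, the plan is:
#   1. the oldest mtime in the bucket becomes the timestamp source;
#   2. the most-linked writable entry becomes the hard-link source;
#   3. every other writable entry gets a Hardlink instruction followed by
#      a CopyTimestamp instruction;
#   4. the link source itself gets the oldest timestamp copied onto it,
#      unless it already is the oldest entry.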
# generate hardlink instructions
sub instructions {
    require Scalar::Util;
    my $self = shift;

    # start generating instructions
    my @inst;
    foreach my $bucket ($self->buckets) {

        # Of the bucket, find the oldest timestamp, regardless of
        # read-only status
        my ($oldest_entry) = _oldest_mtime(@{$bucket});

        # Limit link/unlink operations to files in writable directories
        my @non_readonly;
        foreach my $entry (@{$bucket}) {
            unless (-w $entry->{dirname}) {
                carp "Warning: $entry->{name} cannot be unlinked (directory not writable)";
                next;
            }
            push @non_readonly, $entry;
        }

        # Of the linkable files, use the entry with the most existing hard
        # links as the link source
        my @to_link = _more_linked(@non_readonly);
        my $most_linked_entry = shift @to_link;

        # nothing linkable in this bucket (every entry was read-only)
        next unless defined $most_linked_entry;

        foreach my $entry (@to_link) {
            # XXX there shouldn't be a need to update entries' link counts,
            # since this generates all the instructions at once
            push @inst, Directory::Simplify::Instruction::Hardlink->new(
                source => $most_linked_entry,
                target => $entry,
            );
            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
                source => $oldest_entry,
                target => $entry,
            );
        }
        if (Scalar::Util::refaddr($most_linked_entry) != Scalar::Util::refaddr($oldest_entry)) {
            # most_linked_entry should get its timestamp updated as well
            push @inst, Directory::Simplify::Instruction::CopyTimestamp->new(
                source => $oldest_entry,
                target => $most_linked_entry,
            );
        }
    }

    return @inst;
}
1;
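# Usage sketch (hypothetical driver code, not part of this module; assumes
# a Directory::Simplify::FileHash-style object providing entries()):
#
#   my $gen = Directory::Simplify::Instruction::Generator->new(
#       filehash => $filehash,
#       min_size => 1,
#   );
#   print "$gen\n"; # the "" overload renders the instruction list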