mirror of https://codeberg.org/h3xx/simplify_static_dir
synced 2024-08-14 23:57:24 +00:00

Initial commit

commit 801bed9fa9

1 changed file with 398 additions and 0 deletions

simplify_static_dir.pl  (Executable file, 398 additions)
@@ -0,0 +1,398 @@
#!/usr/bin/perl
# vi: et sts=4 sw=4 ts=4
use strict;
use warnings;

our $VERSION = '1.2.2';

=pod

=head1 NAME

simplify_static_dir - optimize directories for size by combining identical
files

=head1 SYNOPSIS

B<simplify_static_dir> [I<OPTIONS>] [I<DIR>]...

=head1 DESCRIPTION

The more files this script can process at once, the better. It maintains an
internal hash of files indexed by their SHA1 checksum. When it finds a
collision, it removes the file with the fewest file system links, then
creates a hard link to the other in its place. The more files scanned, the
more likely it is that there will be collisions.

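At its core this is a digest-keyed hash lookup, roughly like the following
sketch (compressed for illustration; C<$path> stands for the file currently
being examined, and the real implementation below also records lstat data
such as sizes and link counts):

    my %files;                      # digest => first file seen with that content
    my $digest = Digest::SHA->new->addfile($path)->hexdigest;
    if (exists $files{$digest}) {
        # duplicate content: replace $path with a link to the stored file
    } else {
        $files{$digest} = $path;    # first occurrence of this content
    }
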
There are failsafes in place, though. If two files' SHA1 checksums are somehow
identical but their sizes do not match, this program will error out (and you
can go ahead and report the collision; it may be worth money).

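Concretely, the failsafe is just a size comparison between the stored record
and the new one, performed before anything is unlinked (sketch; the script
below calls the stored record C<$curr_entry> and the new one C<$entry>):

    die 'checksum collision between non-identical files'
        if $curr_entry->{size} != $entry->{size};
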
There are other features built in as well. Following the logic that unique
data, once created, is unique to that point in time, this script copies the
timestamp of the older file onto the newer file just before it makes the hard
link. As a result, simplified directories will often appear to be older than
they actually are.

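In effect, the surviving path ends up carrying the older pair of timestamps,
roughly like this (illustrative names only: C<$older> stands for whichever
record has the smaller mtime, C<$kept_path> for the path that remains; the
script below extracts the same values with a sort):

    utime $older->{atime}, $older->{mtime}, $kept_path;
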
From the perspective of any program reading files from the simplified
directories, the files will lookB<*> and behave the same as they did in their
initial state.

B<*> Except for having an increased number of hard links.

=head1 OPTIONS

=over

=item B<-v>

Verbose output.

=item B<-f>

Print a sum of the number of freed bytes.

=item B<-m> I<REGEX>

Only match file paths matching I<REGEX>.

=item B<-M> I<REGEX>

Exclude file paths matching I<REGEX>.

=item B<-s>

Generate symlinks only.

=item B<-S>

Do not generate ANY symlinks.

=item B<-z>

Include zero-length files in the search. Normally they are ignored (you don't
save disk space by hard-linking empty files).

=item B<--help>

Output this help message and exit.

=item B<--version>

Output version information and exit.

=back

By default, the current directory is scanned. Files that cannot be hard-linked
are symlinked instead.

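For example (the directory paths here are purely illustrative):

    # deduplicate the current directory, reporting each action and the total saved
    simplify_static_dir -v -f

    # deduplicate two snapshot trees, but only consider paths under htdocs/
    simplify_static_dir -f -m 'htdocs/' /srv/backup/2013-01 /srv/backup/2013-02
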
=head1 CHANGES

=over

=item 1.1.0

Outputs GNU-style messages (a la `rm -v', `ln -v', etc.).

=item 1.1.1

Added B<-m> and B<-M> options.

=item 1.1.2

Added B<-z> option. The default behavior is now to skip empty files, because
there's no point in freeing up no space.

=item 1.2.0

Uses L<Digest::SHA> instead of L<Digest::MD5>. MD5 has been broken.

=item 1.2.1

Fixed a bug when processing files with \r characters in their names.

=back

=head1 COPYRIGHT

Copyright (C) 2010-2013 Dan Church.

License GPLv3+: GNU GPL version 3 or later (L<http://gnu.org/licenses/gpl.html>).

This is free software: you are free to change and redistribute it.

There is NO WARRANTY, to the extent permitted by law.

=head1 AUTHOR

Written by Dan Church S<E<lt>amphetamachine@gmail.comE<gt>>

=cut

use File::Find qw/ find /;
require Digest::SHA;
use Getopt::Std qw/ getopts /;
require Pod::Text;

sub HELP_MESSAGE {
#   my $fh = shift;
#   print $fh <<EOF
#Usage: $0 [DIRS]
#Simplify a directory by hard-linking identical files.
#
#  -v           Verbose output.
#  -f           Print a sum of the number of freed bytes.
#  -m REGEX     Only match file paths matching REGEX.
#  -M REGEX     Exclude file paths matching REGEX.
#  -s           Generate symlinks only.
#  -S           Do not generate ANY symlinks.
#  -z           Include zero-length files in search.
#
#By default, scans the current directory. Files not able to be hard-linked are
#symlinked by default.
#EOF
#;
    my ($fh, $pod) = (shift, Pod::Text->new);
    $pod->parse_from_file($0, $fh);

    exit 0;
}

my %opts = (
    v => 0,
    f => 0,
    m => '',
    M => '',
    s => 0,
    S => 0,
    z => 0,
);

&getopts('vfm:M:sSz', \%opts);

my %files;

# Make relative paths absolute by prefixing $ENV{PWD}. The map modifies @ARGV
# in place via s///; in the boolean context of `unless' its return value is
# simply the number of arguments, so when no directories are given the current
# directory is searched instead.
push @ARGV, $ENV{PWD} unless map { s#^([^/])#$ENV{PWD}/$1# } @ARGV;

my $freed_bytes = 0;

&find(\&findexec, @ARGV);

sub findexec {
    # outright skip directories (don't report skip)
    return if -d $File::Find::name;

    # limit to or exclude file patterns specified by `-m' or `-M',
    # respectively

    # truth table
    # -m matches | -M is used & matches | !return?
    #     0      |          0           |    0
    #     0      |          1           |    0
    #     1      |          0           |    1
    #     1      |          1           |    0
    # note: m// will match everything
    unless ($File::Find::name =~ m/$opts{m}/ and
        !(length $opts{M} and $File::Find::name =~ m/$opts{M}/)) {

        print STDERR "Skipping path `$File::Find::name'\n"
            if $opts{v};
        return;
    }

    # make sure the file exists and it's not a link
    if (-f $File::Find::name && ! -l $File::Find::name) {
        #my $ctx = Digest::MD5->new;
        my $ctx = Digest::SHA->new;
        $ctx->addfile($File::Find::name);

        # save the hex digest because reading the value from
        # Digest::* destroys it
        my $digest = $ctx->hexdigest;

        # organize results from lstat into hash
        my $entry = {};
        (@{$entry}{qw/ name dev ino mode nlink uid gid rdev size
                atime mtime ctime blksize blocks /})
            = ($File::Find::name, lstat $File::Find::name);

        # skip zero-length files if wanted (`-z')
        # truth table:
        # -z | non-zero length | return?
        # 0  | 0               | 1
        # 0  | 1               | 0
        # 1  | 0               | 0
        # 1  | 1               | 0
        return unless $opts{z} or $entry->{size};

        # check to see if we've come across a file with the same checksum
        if (exists $files{$digest}) {
            my $curr_entry = $files{$digest};

            # don't waste my time
            return if $curr_entry->{name} eq $entry->{name} or
                $curr_entry->{ino} == $entry->{ino};

            # identical files should be the same size (I'm paranoid
            # of checksumming procedures); if the sizes don't match,
            # there's a problem with the checksumming procedure or this
            # script is processing WAY too many files
            # (addendum: perhaps I should congratulate the user on
            # finding a collision in SHA-1)
            if ($curr_entry->{size} != $entry->{size}) {
                die "ERROR: checksums identical for two non-identical files!!!:\n".
                    "files:\t`$curr_entry->{name}'\n".
                    "\t`$entry->{name}'\n".
                    "SHA1: ($digest)\n".
                    '(this is probably a limit of SHA1; try processing fewer files)';
            }

            # find the oldest time stamp
            my ($atime, $mtime) = @{(sort
                {$a->{mtime} <=> $b->{mtime}}
                ($entry, $curr_entry)
            )[0]}{qw/ atime mtime /};

            # find the file less embedded in the file system
            my ($less_linked, $more_linked) = sort
                {$a->{nlink} <=> $b->{nlink}}
                ($entry, $curr_entry);

            # hard-linkable files must exist on the same device and
            # must not already be hard-linked
            if ($curr_entry->{dev} == $entry->{dev} &&
                ! $opts{s}) {
#               print "hard-linking $new_file\t=>$old_file\n";
                # attempt to unlink the file
                printf STDERR "removing file `%s'\n",
                    $less_linked->{name} if $opts{v};
                unless (unlink $less_linked->{name}) {

                    # couldn't do it; try more-linked file

                    printf STDERR <<EOF
Failed to remove file `%s': %s
(using `%s')
EOF
,
                    $less_linked->{name},
                    $!,
                    $more_linked->{name}
                        if $opts{v};

                    # if we can't do this, there's no point
                    # in continuing
                    unless (unlink $more_linked->{name}) {
printf STDERR <<EOF
Failed to remove file `%s' (second failure on match): %s
Skipping...
EOF
,
                        $more_linked->{name},
                        $!
                            if $opts{v};

                        return;
                    }

                    # the ol' switcheroo
                    ($more_linked, $less_linked) =
                    ($less_linked, $more_linked);

                }

                # we unlinked it or failed out
                $freed_bytes += $less_linked->{size}
                    unless $less_linked->{nlink} > 1;

                printf STDERR "hard linking `%s' => `%s'\n",
                $less_linked->{name}, $more_linked->{name}
                if $opts{v};

                # hard link the files
                link $more_linked->{name},
                $less_linked->{name};

                # update link count in our hash to reflect the
                # file system (referenced)
                ++$more_linked->{nlink};

                # preserve older time stamp
                utime $atime, $mtime, $less_linked->{name};
            } elsif (! $opts{S}) {
                # files are on different devices (or `-s' was given);
                # the most that can be done is to soft-link them

                unless (unlink $less_linked->{name}) {

                    # couldn't do it; try more-linked file

                    printf STDERR "couldn't remove file `%s' (using `%s')\n",
                    $less_linked->{name},
                    $more_linked->{name} if $opts{v};

                    # if we can't do this, there's no point
                    # in continuing
                    unlink $more_linked->{name}
                        or return;

                    # the ol' switcheroo
                    ($more_linked, $less_linked) =
                    ($less_linked, $more_linked);

                }

                # we unlinked it or failed out
                $freed_bytes += $less_linked->{size};

                printf STDERR "soft-linking %s => %s\n",
                $less_linked->{name}, $more_linked->{name}
                if $opts{v};

                # create a soft link (TODO: relative links)
                symlink $more_linked->{name},
                $less_linked->{name};

                # preserve older time stamp
                utime $atime, $mtime, $less_linked->{name};
            }
        } else {
            # the file is unique (as far as we know)
            # create a new entry in the hash table
            $files{$digest} = $entry;
        }
    #} elsif (-l $File::Find::name) {
    #   # do something to simplify symlinks
    #   printf STDERR "FIXME: simplifying symlink `%s'\n",
    #   $File::Find::name
    #   if $opts{v};

    #   printf STDERR "symlink `%s' points to `%s'\n",
    #   $File::Find::name, readlink $File::Find::name;
    }
}

printf STDERR "freed %d bytes (%0.4G %s)\n",
    $freed_bytes, &hr_size($freed_bytes)
        if $opts{f} or $opts{v};

sub hr_size {
    my $sz = shift;
    my @sizes = qw/ B KB MB GB TB PB EB ZB YB /;
    my $fact = 1024;
    my $thresh = 0.1;
    my @ret;
    foreach my $exp (reverse 0 .. $#sizes) {
        if ($sz > (1 - $thresh) * $fact ** $exp) {
            @ret = ($sz / $fact ** $exp, $sizes[$exp]);
            last;
        }
    }

    # default to ($sz, 'bytes')
    @ret = ($sz, $sizes[0]) unless @ret;

    wantarray ? @ret : "@ret"
}