#!/usr/bin/perl
# Duplicate Killer 1.1 by Tyler Bletsch
#   1.0 Initial
#   1.1 Switched to directory search semantics
#
# Scans the given directories for files whose names match a wildcard mask,
# computes an MD5 digest of each one, and reports (or, with -k/-K, deletes)
# every file whose digest equals that of a file seen earlier in the scan.
# The first file seen with a given digest is always kept.

use strict;
use warnings;

use File::Find;
use Digest::MD5;

$| = 1;    # unbuffered output so progress lines appear immediately

# NOTE(review): the heredoc opener and the first "Usage:" line were lost in
# the copy of the file this was restored from; the wording of that one line is
# a reconstruction and should be confirmed against the original script.
my $usage = <<'EODOC';
Usage: dupekill.pl [options] [-m<filemask>] [directories ...]

Where:
  filemask:    a wildcard expression (not expanded by the shell, default '*')
  directories: zero or more directories to search (default '.')

Options:
  -k  Kill: Delete duplicates instead of simply reporting them.
  -K  Kill without remorse: Just like -k, but doesn't prompt for confirmation
  -r  Recursive: traverse down every directory given
  -s  Simple: Output simple colon-separated listing of duplicates
  -p  Path stats: Output notices on entering and leaving directories

For example, to remove all duplicate jpg files in a subdirectory called images:
  dupekill.pl -k -m*.jpg images
EODOC

# --- Settings chosen on the command line (filled in by main) ---------------
my $mask       = '*';    # wildcard that file names must match
my $bKill      = 0;      # -k / -K: unlink duplicates
my $bNoPrompt  = 0;      # -K: skip the confirmation question
my $bRecurse   = 0;      # -r (or -f): descend into subdirectories
my $bPathStats = 0;      # -p: print a line on entering/leaving a directory
my $bSimple    = 0;      # -s: machine-readable "original:duplicate" output

# --- Scan state -------------------------------------------------------------
my $maskRegex;           # compiled regex equivalent of $mask
my %history;             # digest => full path of the first file seen with it
my %filesInDir;          # directory => number of regular files examined
my %dupesInDir;          # directory => number of duplicates found there
my $dupeSize = 0;        # total size in bytes of all duplicates
my $uniques  = 0;        # number of files with a previously unseen digest
my $dupes    = 0;        # number of files whose digest was already seen

############################################################

# Entry point: parse @ARGV, optionally ask for confirmation, run the scan and
# print the summary. Exits after printing usage on an unknown option and dies
# with "Aborted." if the user declines (or STDIN hits EOF) at the prompt.
sub main {
    # Guard on @ARGV so an empty command line does not match against undef.
    while (@ARGV && $ARGV[0] =~ /^-/) {
        my $opt = shift @ARGV;
        if    ($opt eq '-k')        { $bKill = 1; }
        elsif ($opt eq '-K')        { $bKill = 1; $bNoPrompt = 1; }
        elsif ($opt eq '-r')        { $bRecurse = 1; }
        elsif ($opt eq '-p')        { $bPathStats = 1; }
        elsif ($opt =~ /^-m(.*)/s)  { $mask = $1; }
        elsif ($opt eq '-f')        { $bRecurse = 1; }    # undocumented alias of -r
        elsif ($opt eq '-s')        { $bSimple = 1; }
        else                        { print $usage; exit; }
    }

    my @directories = @ARGV ? @ARGV : ('.');

    my $desc =
        ($bKill ? "Hunting and killing duplicates " : "Passively scanning for duplicates ")
        . "matching '$mask' in paths {" . join(',', @directories) . "}...";

    # In simple mode STDOUT carries only the duplicate listing, so the banner
    # is diverted to STDERR.
    select STDERR if $bSimple;
    print "$desc\n\n";

    if ($bKill && !$bNoPrompt) {
        while (1) {
            print STDERR "Are you sure you want to do this? [y/n] ";
            my $resp = <STDIN>;

            # EOF on STDIN can never turn into a "y"; abort instead of
            # re-prompting forever.
            die "Aborted.\n" unless defined $resp;

            chomp($resp = lc $resp);
            die "Aborted.\n" if $resp eq 'n';
            last if $resp eq 'y';
        }
        print "\n";
    }
    select STDOUT if $bSimple;

    $dupeSize = $uniques = $dupes = 0;
    my $pattern = glob2regex($mask);
    $maskRegex = qr/$pattern/;

    # Tell us some stats on break
    $SIG{INT} = sub {
        select STDERR;
        print "BREAK!\n";
        printSummary();
        exit(0);
    };

    find(
        {
            preprocess  => \&enterDir,
            postprocess => \&leaveDir,
            wanted      => sub { doFile() if /$maskRegex/ },
        },
        @directories
    );

    printSummary() unless $bSimple;
    return;
}

# Print the duplicate/unique counts, their percentages and the total size of
# the duplicates to the currently selected output handle.
sub printSummary {
    my $total = $dupes + $uniques;

    # "|| 1" avoids dividing by zero when no file was examined.
    my $fracDupe   = sprintf("%.1f%%", $dupes   / ($total || 1) * 100);
    my $fracUnique = sprintf("%.1f%%", $uniques / ($total || 1) * 100);

    print "\nSummary:\n";
    print " $dupes duplicates " . ($bKill ? "killed" : "found")
        . " ($fracDupe, ", sizeInBytes($dupeSize), ")\n";
    print " $uniques uniques ($fracUnique)\n";
    print " $total total files\n";
    return;
}

# File::Find "preprocess" hook: called with the entries of the directory that
# is about to be scanned. Resets that directory's counters and returns the
# entries File::Find should go on to process: everything when recursing,
# otherwise only regular files matching the mask (so no subdirectory is
# entered).
sub enterDir {
    my @entries = @_;
    my $dir     = $File::Find::dir;

    print "+ $dir\n" if $bPathStats;
    $filesInDir{$dir} = 0;
    $dupesInDir{$dir} = 0;

    return @entries if $bRecurse;
    return grep { -f $_ && /$maskRegex/ } @entries;
}

# File::Find "postprocess" hook: with -p, report how many of the directory's
# files turned out to be duplicates.
sub leaveDir {
    my $dir = $File::Find::dir;
    print "- $dir ($dupesInDir{$dir}/$filesInDir{$dir})\n" if $bPathStats;
    return;
}

# Handle one candidate file. $_ is the file name relative to the directory
# File::Find has changed into; $File::Find::name is the full path.
# Records the file as unique, or reports (and with -k/-K unlinks) it when a
# file with the same digest has already been seen.
sub doFile {
    return unless -f $_;

    my $dir  = $File::Find::dir;
    my $path = $File::Find::name;
    $filesInDir{$dir}++;

    my $fs = fileSummary($_);

    # No digest means the file could not be read (fileSummary has already
    # warned). Skip it: otherwise all unreadable files would share one hash
    # key and be treated - and possibly deleted - as duplicates of each other.
    return unless defined $fs;

    if (exists $history{$fs}) {
        $dupes++;
        $dupeSize += (stat $_)[7] || 0;
        $dupesInDir{$dir}++;

        if ($bSimple) {
            print "$history{$fs}:$path\n";
        }
        else {
            printf "%-38s : %-38s\n", $history{$fs}, $path;
        }

        if ($bKill) {
            unlink $_ or warn "$_: $!\n";
        }
    }
    else {
        $history{$fs} = $path;
        $uniques++;
    }
    return;
}

# Convert a shell-style wildcard into an anchored regular expression string.
#   '*' matches any run of characters, '?' matches exactly one character,
#   every other character matches itself.
# Returns the regex source as a string (e.g. '*.jpg' -> '^.*\.jpg$').
#
# Thanks Richard F
# http://mail.nl.linux.org/xchat-discuss/1999-10/msg00013.html
sub glob2regex {
    my ($glob) = @_;
    $glob = '' unless defined $glob;

    # Escape all characters with special meanings in regexes by prefixing
    # them with \ (the chars ()|\[].^$+{} but not ? and *).
    (my $re = $glob) =~ s/([\(\)\|\\\[\]\.\^\$\+\{\}])/\\$1/g;

    $re =~ s/\*/.*/g;    # '*' -> any run of characters
    $re =~ s/\?/./g;     # '?' -> exactly one character

    return "^$re\$";
}

# Return the hexadecimal MD5 digest of the named file's raw bytes, or undef
# (after emitting a warning) when the file cannot be opened or read.
sub fileSummary {
    my ($file) = @_;

    # Three-argument open: the name is taken literally, so leading/trailing
    # whitespace or mode characters in it cannot change what gets opened.
    my $fh;
    unless (open $fh, '<', $file) {
        warn "$file: $!\n";
        return;
    }
    binmode $fh, ':raw';

    # addfile croaks on a read error; trap it so one bad file does not abort
    # the whole scan.
    my $digest = eval { Digest::MD5->new->addfile($fh)->hexdigest };
    my $err    = $@;
    close $fh;

    unless (defined $digest) {
        warn "$file: $err";
        return;
    }
    return $digest;
}

# Format a byte count for display, using the largest binary unit (kB, MB, GB,
# TB) that keeps the value at or above 1; smaller counts are shown as "N B".
sub sizeInBytes {
    my ($s) = @_;    # size in bytes
    return sprintf("%.2f TB", $s / 2**40) if $s >= 2**40;
    return sprintf("%.2f GB", $s / 2**30) if $s >= 2**30;
    return sprintf("%.2f MB", $s / 2**20) if $s >= 2**20;
    return sprintf("%.2f kB", $s / 2**10) if $s >= 2**10;
    return "$s B";
}

# Run the scan only when executed as a script, so the subs above can be
# loaded (e.g. by a test) without touching the file system.
main() unless caller;

1;