#!/usr/bin/perl

#die glob2regex($ARGV[0])."\n";

#Duplicate Killer 1.1 by Tyler Bletsch
#1.0 Initial
#1.1 Switched to directory search semantics

use File::Find;
use Digest::MD5;

# Autoflush the currently selected output handle so report lines are not
# held back in a buffer.
$|=1;

# Help text printed when an unrecognized switch is given. The synopsis line
# lists every switch described under "Options".
$usage = <<EODOC;
Duplicate Killer 1.1 by Tyler Bletsch

Usage: 
  dupekill.pl [-r] [-k|-K] [-s] [-p] [-m<filemask>] [directories ...]

Where:
  filemask: a wildcard expression (not expanded by the shell, default '*')
  directories: zero or more directories to search (default '.')

Options:
  -k Kill: Delete duplicates instead of simply reporting them.
  -K Kill without remorse: Just like -k, but doesn't prompt for confirmation
  -r Recursive: traverse down every directory given
  -s Simple: Output simple colon-separated listing of duplicates
  -p Path stats: Output notices on entering and leaving directories

For example, to remove all duplicate jpg files in a subdirectory called images:
  dupekill.pl -k -m*.jpg images

EODOC

# NOTE(review): nothing in the visible code reads $bufferSize (hashing is
# done through Digest::MD5's addfile); kept in case something else uses it.
$bufferSize = 65536;


# Default file mask: match every file name.
$mask = '*';

# Consume leading switches from @ARGV; whatever remains afterwards is the
# list of directories. Sets the $bKill / $bNoPrompt / $bRecurse /
# $bPathStats / $bSimple flags and $mask. Any unrecognized switch prints the
# usage text and exits.
while (@ARGV && $ARGV[0] =~ /^-/) {
	my $opt = shift;
	if ($opt eq '-k') {
		$bKill=1;
	} elsif ($opt eq '-K') {
		$bKill=1;
		$bNoPrompt=1;
	} elsif ($opt eq '-r') {
		$bRecurse=1;
	} elsif ($opt eq '-p') {
		$bPathStats=1;
	} elsif ($opt =~ /^-m(.*)/) {
		# Anchored so that only a switch that really starts with -m sets the
		# mask; something like "-x-mfoo" now falls through to the usage text.
		$mask = $1;
	} elsif ($opt eq '-f') {
		# Accepted as an undocumented synonym for -r.
		$bRecurse=1;
	} elsif ($opt eq '-s') {
		$bSimple=1;
	} else {
		print $usage;
		exit;
	}
}

# Directories to scan: the remaining arguments, or the current directory.
@directories = @ARGV ? @ARGV : ('.');

$desc = ($bKill ?
	"Hunting and killing duplicates " :
	"Passively scanning for duplicates ").
	"matching '$mask' in paths {".join(',',@directories)."}...";

# In simple mode STDOUT carries only the colon-separated listing, so the
# banner (and the blank line after the prompt) are sent to STDERR instead.
select STDERR if ($bSimple);

print "$desc\n\n";

# Ask for confirmation before deleting anything, unless -K was given.
if ($bKill && !$bNoPrompt) {
	while (1) {
		print STDERR "Are you sure you want to do this? [y/n] ";
		my $resp = <STDIN>;
		# End of input (closed or exhausted STDIN) can never produce a
		# "y"; abort instead of re-prompting forever.
		die "\nAborted.\n" unless defined $resp;
		chomp ($resp = lc $resp);
		if ($resp eq 'n') { die "Aborted.\n"; }
		last if ($resp eq 'y');
	}
	print "\n";
}

select STDOUT if ($bSimple);


$dupeSize = $uniques = $dupes = 0;

# Regex source string equivalent to the wildcard mask; interpolated into
# matches here and in enterDir.
$maskRegex = glob2regex($mask);

#Tell us some stats on break
$SIG{INT} = sub { select STDERR; print "BREAK!\n"; printSummary(); exit(0); };

# preprocess/postprocess bracket each directory; wanted hashes every entry
# whose name matches the mask (doFile itself ignores non-files).
find({preprocess => \&enterDir,postprocess=>\&leaveDir,wanted => sub{/$maskRegex/ and doFile()}}, @directories);

printSummary() unless ($bSimple);

############################################################

# Print the final tally to the currently selected output handle: number of
# duplicates (with their share of all hashed files and their combined
# size), number of uniques, and the total.
# Reads the globals $dupes, $uniques, $dupeSize and $bKill. Its working
# values are lexicals so they do not leak into the package namespace.
sub printSummary {
	my $total = $dupes+$uniques;
	# Avoid dividing by zero when no file was examined.
	my $divisor = $total || 1;
	my $fracDupe = sprintf("%.1f%%",$dupes/$divisor*100);
	my $fracUnique = sprintf("%.1f%%",$uniques/$divisor*100);
	print "\nSummary:\n";
	print "  $dupes duplicates ".($bKill?"killed":"found")." ($fracDupe, ",sizeInBytes($dupeSize),")\n";
	print "  $uniques uniques ($fracUnique)\n";
	print "  $total total files\n";
}

# File::Find "preprocess" hook, called with the entries of the directory
# about to be scanned ($File::Find::dir). Optionally announces the
# directory, zeroes its per-directory counters, and returns the entries
# that File::Find should go on to process.
sub enterDir {
	my $dir = $File::Find::dir;

	if ($bPathStats) {
		print "+ $dir\n";
	}

	# Start this directory's statistics from scratch.
	$dupesInDir{$dir} = 0;
	$filesInDir{$dir} = 0;

	# Recursive mode hands every entry back, subdirectories included.
	return @_ if $bRecurse;

	# Otherwise keep only plain files whose name matches the mask, which
	# also stops File::Find from descending any further.
	return grep { (-f $_) && /$maskRegex/ } @_;
}

# File::Find "postprocess" hook, called when a directory has been fully
# scanned. With path stats enabled it prints the directory name followed by
# (duplicates/files) counted for that directory.
sub leaveDir {
	my $dir = $File::Find::dir;
	if ($bPathStats) {
		print '- ' . $dir . ' (' . $dupesInDir{$dir} . '/' . $filesInDir{$dir} . ")\n";
	}
}


# File::Find "wanted" worker: hash the current entry and classify it.
# $_ is the local filename, $File::Find::name is the full path filename.
# The first file seen with a given digest is remembered in %history as the
# unique original; every later file with the same digest is reported as a
# duplicate of it and, in kill mode, deleted.
# Updates the globals $dupes, $uniques, $dupeSize, %filesInDir, %dupesInDir.
sub doFile {
	return unless (-f);
	$filesInDir{$File::Find::dir}++;

	my $digest = fileSummary($_);

	# fileSummary yields undef for a file it could not open (it has already
	# warned). Such a file must not be classified: all unreadable files
	# would otherwise share one hash key and be reported -- and with -k
	# deleted -- as duplicates of each other.
	return unless defined $digest;

	# First time this content is seen: remember where. exists() rather than
	# a truth test so that a stored path of "0" still counts as seen.
	unless (exists $history{$digest}) {
		$history{$digest} = $File::Find::name;
		$uniques++;
		return;
	}

	# The very same path reached a second time (e.g. a directory listed
	# twice on the command line) is not a duplicate; deleting it would
	# destroy the only copy. This only catches identical path spellings.
	return if $history{$digest} eq $File::Find::name;

	$dupes++;
	$dupeSize += (stat $_)[7] || 0;
	$dupesInDir{$File::Find::dir}++;
	if ($bSimple) {
		print "$history{$digest}:$File::Find::name\n";
	} else {
		printf "%-38s : %-38s\n",$history{$digest},$File::Find::name;
	}
	if ($bKill) {
		unlink or warn "$_: $!\n";
	}
}

# Translate a shell-style wildcard mask into the source text of an anchored
# regular expression.
#   '*' matches any run of characters (including none)
#   '?' matches exactly one character
# Every other regex metacharacter is taken literally.
# Takes the mask string and returns the regex as a plain string, e.g.
# '*.jpg' becomes '^.*\.jpg$'.
sub glob2regex {
	# Work on a lexical copy instead of localizing the global $_.
	my ($pattern) = @_;

	#Thanks Richard F <dfx@gmx.at>
	#http://mail.nl.linux.org/xchat-discuss/1999-10/msg00013.html

	#escape all characters with special meanings in regexs by prefixing them
	#  with \ (eg the chars: ()|\[].^$+{} but not ? and *)
	$pattern =~ s/([\(\)\|\\\[\]\.\^\$\+\{\}])/\\$1/g;

	#replace '*' by '.*'
	$pattern =~ s/\*/.*/g;

	#replace '?' by '.' -- exactly one character; '.?' would also accept
	#  names in which that character is missing altogether
	$pattern =~ s/\?/./g;

	#prepend ^ and append $ so the whole name has to match
	return "^$pattern\$";
}

# Compute the MD5 digest of a file's raw contents.
# Takes the path of the file to read. Returns the digest as a hex string,
# or undef (after emitting a warning) when the file cannot be opened.
sub fileSummary($) {
	my ($file) = @_;

	# Three-argument open takes the name literally; the two-argument form
	# strips leading/trailing whitespace from it, so such files could not
	# be opened (or a different file was opened). A lexical handle also
	# avoids sharing a global bareword handle.
	my $fh;
	unless (open($fh, '<', $file)) {
		warn "$file: $!\n";
		return undef;
	}
	binmode $fh,":raw";

	my $ctx = Digest::MD5->new;
	$ctx->addfile($fh);

	close $fh;

	return $ctx->hexdigest;
}

# Render a byte count for humans, using the largest binary unit (TB, GB,
# MB, kB) that the value reaches, with two decimals; values below 1024 are
# shown as a plain number of bytes.
sub sizeInBytes {
	my ($bytes) = @_;

	# Units from largest to smallest, each paired with its power of two.
	foreach my $unit (['TB',40], ['GB',30], ['MB',20], ['kB',10]) {
		my ($suffix, $power) = @$unit;
		my $scale = 2**$power;
		if ($bytes >= $scale) {
			return sprintf("%.2f %s", $bytes/$scale, $suffix);
		}
	}

	return "$bytes B";
}