#!/usr/bin/env newlisp ;; @module find-duplicate-files ;; @author cormullion ;; @version 0.0.3 2007-12-02 09:28:45 ;; ;; This newLISP file finds duplicate files. MacOS-specific but could probably ;; be adapted. It ignores filenames, testing sizes and MacOS resource forks. ;; Options: move to a duplicates folder, add comment to spotlight, set finder label ;; Usage: find-duplicate-files.lsp [folder1 folder2 ...] [-usage] [-move] [-move-dummy] [-spotlight] [-label] ;; doesn't like files with single quotes in the names (Unix escaping problems)! ;; Good for 50000 files and possibly more. ;; @syntax (set-spotlight-comment file comment) ;; @param file pathname of file ;; @param comment text to assign to file ;; Set the comment field of a file so that Spotlight can find it. (define (set-spotlight-comment file comment) "set Spotlight comment of file to comment" (exec (format [text]osascript -e 'set pf to POSIX file "%s" ' -e 'tell application "Finder" to set comment of pf to "%s" ' [/text] file comment))) ;; @syntax (set-finder-label file int-label-index) ;; @param file pathname of file ;; @param int-label-index ;; Set the Finder label of a file. (define (set-finder-label file (colour 0)) (let (colours '(black red orange yellow green purple blue grey)) (set 'colour (find colour colours)) (exec (format [text]osascript -e 'set pf to POSIX file "%s"' -e 'tell application "Finder" to set label index of pf to %d'[/text] file colour)))) ;; @syntax (walk-tree folder) ;; @param a folder to walk ;; Recursively examine folder and built a list of the ;; files, their sizes, and their resource fork sizes too. (define (walk-tree folder , item-name ) " build a list of all files in the folder, with sizes: ((size1 file1) (size2 file2) ... )" (dolist (item (directory folder)) (set 'item-name (string folder "/" item)) (if (and (directory? item-name) (!= item ".") (!= item "..")) (walk-tree item-name) ; recurse ; else process the item (and (not (starts-with item ".")) ; skip hidden files (set 'path-name (real-path item-name)) (file-info path-name) ; skip symlinks... (set 'dataforksize (first (file-info path-name))) (if (file? (format {%s/..namedfork/rsrc} path-name )) ; add resource fork size if one exists at /..namedfork/rsrc (begin (set 'resourceforksize (first (file-info (format {%s/..namedfork/rsrc} path-name )))) ) (set 'resourceforksize 0)) ; put composite file size and file name into dupe-list (push (cons (+ dataforksize resourceforksize ) path-name ) dupe-list -1 ))))) ; start (if (find "-usage" (main-args)) (println [text] Usage: find-duplicate-files.lsp [folder1 folder2 ...] [-usage] [-move] [-move-dummy] [-spotlight] [-label] Find duplicate files in the specified folders ... or in the current folder and subfolders. -move moves one of the duplicates to a folder 'duplicates' in the current folder -move-dummy pretends to move - you can see what might happen... -spotlight puts the word 'duplicate' in the comments field of the file -label sets the Finder label to red [/text] (exit))) (if (find "-move" (main-args)) (set 'move true) (set 'move nil)) (if (find "-label" (main-args)) (set 'label true) (set 'label nil)) (if (find "-move-dummy" (main-args)) (set 'move-dummy true) (set 'move-dummy nil)) (if (find "-spotlight" (main-args)) (set 'spotlight true) (set 'spotlight nil)) ; drop first argument (newlisp) and remove options (set 'file-args (rest (clean (fn (arg) (starts-with arg "-")) (main-args)))) (if (> (length file-args) 1) (dolist (folder file-args) (println "... gathering files in folder " (real-path folder) "\n") (walk-tree folder)) (begin (println "... gathering files in folder " (real-path) "\n") (walk-tree (real-path)))) (println "... sorting " (length dupe-list) " items\n") (set 'dupe-list (sort dupe-list )) ; sort by size - very important! (println "... duplicates are: \n") ; see if two adjacent items have the same size ; this is a kludge to avoid an error ; If we start with item 1, we have no 'previous' pair for comparison ; I'd really like to start at item 2... (set 'previous (last dupe-list)) (dolist (current dupe-list) (if (= (first current) (first previous)) ; current same size as previous? (and ; same size, compare md5 checksums (set 'current-dataforkmd5 (exec (format {md5 -q '%s'} (last current)))) ; fails if file contains quotes (set 'current-resourceforkmd5 (exec (format {md5 -q '%s/..namedfork/rsrc'} (string (last current))))) (set 'previous-dataforkmd5 (exec (format {md5 -q '%s'} (last previous)))) (set 'previous-resourceforkmd5 (exec (format {md5 -q '%s/..namedfork/rsrc'} (last previous)))) (and (> (+ (first current ) (first previous) 0)) ; not 0 (= current-dataforkmd5 previous-dataforkmd5 ) (= current-resourceforkmd5 previous-resourceforkmd5) (println (format " %12d %s" (first previous) (last previous))) (println (format " = %12d %s" (first current) (last current))) ; do we want to set the spotlight comments (if spotlight (set-spotlight-comment (last current) (string "duplicate " (last previous))) true) ; continue the 'and' if we didn't do comments ; or we can set Finder colours... (if label (begin (set-finder-label (last current) 'red) (println (dup { } 10) {labelled!})) true) (if (or move move-dummy) (begin (set 'parent (join (reverse (rest (reverse (parse (real-path (last current)) {/}))) ) "/")) (set 'path (string (last (parse (real-path (last current)) {/})))) (if (not move-dummy) (set 'rename-result (rename-file ; old-name (string parent "/" path) ; rename with 'dup' in front? ; (string parent {/} (string {dup} path)) ; or move. This folder must exist... (string {duplicates/} path)))) (if move-dummy (println (dup { } 10) {You would have tried to rename } "\n" (dup { } 10) (string parent "/" path) "\n" (dup { } 10) { to } (string {duplicates/} path))) (println (dup { } 10) (if rename-result {You succeeded in renaming } {You didn't rename }) (dup { } 10) (string parent "/" path) "\n" (dup { } 10) { to } (string {duplicates/} path) "\n")) true) )) ; remember this one for the next comparison (set 'previous current))) (println "... finished") (exit)