Removed my oh-so-clever "shadowdir" and "shadowfile" concepts for
the directory to be examined for potential deduplication. Instead
we call them subject directories and subject files, because they
are subject to our process.
Now creates a temporary file, removed on exit, as our temporary
access and modification time holder. When replacing a file with a
hard link, we first copy the atime/mtime of its directory to the
temporary file. THEN we perform the link. After the link, we
restore the original atime/mtime of the enclosing directory from
the temporary file. This is useful for backups, where we do not
want the directories to have newer timestamps (timestamps matching
the date/time that we run deduplication, because creating the link
counts as a modification of the directory).
110 lines
4.1 KiB
Bash
Executable File
110 lines
4.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
# Copyright (c) 2023 Joshua Lee Ockert <torstenvl@gmail.com>
|
|
#
|
|
# THIS WORK IS PROVIDED "AS IS" WITH NO WARRANTY OF ANY KIND. THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY, FITNESS, NON-INFRINGEMENT, AND TITLE ARE
|
|
# EXPRESSLY DISCLAIMED. NO AUTHOR SHALL BE LIABLE UNDER ANY THEORY OF LAW
|
|
# FOR ANY DAMAGES OF ANY KIND RESULTING FROM THE USE OF THIS WORK.
|
|
#
|
|
# Permission to use, copy, modify, and/or distribute this work for any
|
|
# purpose is hereby granted, provided this notice appears in all copies.
|
|
|
|
|
|
############################################################################
|
|
## FUNCTION TO PRINT USAGE INSTRUCTIONS ##
|
|
############################################################################
|
|
printusage() {
|
|
echo "
|
|
DIRECTORY DE-DUPLICATOR
|
|
|
|
USAGE
|
|
|
|
dirdedupe.sh [--execute] masterdir subjectdir
|
|
|
|
DESCRIPTION
|
|
|
|
For each file in subjectdir, replace it with a hard link to the matching
|
|
file (if any) in masterdir. A file will be considered a match if, and
|
|
only if, it shares the same file name, relative path, and contents.
|
|
|
|
OPTIONS
|
|
|
|
--execute Actually remove and link duplicate files. By default, this
|
|
program runs in test mode.
|
|
|
|
"
|
|
}
|
|
|
|
|
|
############################################################################
|
|
## PARSE & MAKE SENSE OF COMMAND LINE ##
|
|
############################################################################
|
|
|
|
# CHECK FOR THE EXECUTE FLAG (DEFAULT IS TESTING-ONLY MODE)
|
|
REALLYRUN=0
|
|
if [ "${1}" == "--execute" ]; then
|
|
REALLYRUN=1
|
|
shift
|
|
fi
|
|
|
|
# MAKE SURE WE HAVE THE RIGHT NUMBER OF ARGUMENTS AND THEY'RE VALID
|
|
if [ ! $# -eq 2 ]; then
|
|
echo && echo "Wrong number of arguments!" && printusage && exit
|
|
else
|
|
masterdir=$1
|
|
subjectdir=$2
|
|
if [ ! -d "${masterdir}" ]; then
|
|
echo && echo "${masterdir} is not a directory!" && printusage && exit
|
|
elif [ ! -d "${subjectdir}" ]; then
|
|
echo && echo "${subjectdir} is not a directory!" && printusage && exit
|
|
fi
|
|
fi
|
|
|
|
############################################################################
|
|
## MAKE A TEMPORARY FILE FOR PRESERVING TIMESTAMPS ##
|
|
############################################################################
|
|
TEMPFILE = $(mktemp)
|
|
trap "rm -f ${TEMPFILE}" EXIT
|
|
|
|
############################################################################
|
|
## HARDLINK THE DUPLICATES (OR NOT) ##
|
|
############################################################################
|
|
find "${subjectdir}" -print0 | while read -d $'\0' subjectfile
|
|
do
|
|
if [ -f "${subjectfile}" ]; then
|
|
masterfile="${subjectfile/#${subjectdir}/${masterdir}}"
|
|
if [ -f "${masterfile}" ]; then
|
|
if [ ! "${subjectfile}" -ef "${masterfile}" ]; then
|
|
cmp -s "${masterfile}" "${subjectfile}"
|
|
if [ $? -eq 0 ]; then
|
|
if [ $REALLYRUN -gt 0 ]; then
|
|
echo "LINK \"${masterfile}\" <-- \"${subjectfile}\""
|
|
# Store the mtime/atime of subject file's directory
|
|
TEMPSUBJDIR=`dirname "${subjectfile}"`
|
|
#touch -r "${TEMPSUBJDIR}" "${TEMPFILE}"
|
|
# Link the subject file to the corresponding file in
|
|
# the master directory
|
|
ln -Pf "${masterfile}" "${subjectfile}"
|
|
# Restore the mtime/atime of subject file's directory
|
|
#touch -r "${TEMPFILE}" "${TEMPSUBJDIR}"
|
|
else
|
|
echo "HYPO \"${masterfile}\" <~~ \"${subjectfile}\""
|
|
TEMPSUBJDIR=`dirname "${subjectfile}"`
|
|
echo " Saving atime/mtime of |${TEMPSUBJDIR}|"
|
|
fi
|
|
#else
|
|
#echo "MOD \"${masterfile}\" <X> \"${subjectfile}\""
|
|
fi #END check for files being the same
|
|
#else
|
|
#echo "ID \"${masterfile}\" <-> \"${subjectfile}\""
|
|
fi # END check for inode equality
|
|
#else
|
|
#echo "NEW \"${subjectfile}\" "
|
|
fi # END check if master file exists
|
|
fi
|
|
done
|
|
|
|
exit
|
|
|