From 3bc47deab0ec8288c5bf3f01a6f881c1e17bda70 Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 20 Jun 2014 16:07:31 -0400 Subject: jh-checksource: optimize further --- jh-checksource.sh | 72 ++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/jh-checksource.sh b/jh-checksource.sh index 9de91c0..5296226 100644 --- a/jh-checksource.sh +++ b/jh-checksource.sh @@ -4,16 +4,19 @@ # terms of the Do What The Fuck You Want To Public License, Version 2, # as published by Sam Hocevar. See the COPYING file for more details. -sep='' +# Regular expressions are POSIX EREs, and must match the entirety of the string -safe_types_regexp=('^(inode|text|image|video|audio)/') -safe_types_string=('application/pdf' 'application/postscript' 'application/xml' 'application/ogg' 'message/rfc822') safe_dirs_glob=(.{git,hg,svn} '*.git') -safe_files_regexp=('/po/[^/]*.gmo$' '\.flw$' '\.odg$' '\.ppt$') +safe_types_regexp=('(inode|text|image|video|audio)/.*' 'application/(pdf|postscript|xml|ogg)' message/rfc822) +safe_files_regexp=('.*/po/[^/]*.gmo' '.*\.(flw|odg|ppt)') safe_files_string=() -# don't care about files less than 3 bytes. -min_size=3 +# don't care about files less than 3 bytes ('c' is for characters) +min_size=3c + +sep='' + +################################################################################ cwd="$(readlink -m -- "$PWD")" @@ -23,28 +26,6 @@ normalize_filenames() { done | sed "s|^$cwd/|./|" } -matches_string() { - local needle=$1 - shift - for straw in "$@"; do - if [[ "$needle" = "$straw" ]]; then - return 0 - fi - done - return 1 -} - -matches_regexp() { - local needle=$1 - shift - for straw in "$@"; do - if [[ "$needle" =~ $straw ]]; then - return 0 - fi - done - return 1 -} - print-human() { libremessages warning "The source directory %s contains binary files:" "$PWD" sed 's/^/ -> /' @@ -69,36 +50,23 @@ main() { local unsafe_files="$(mktemp --tmpdir "${0##*/}.XXXXXXXXXX")" trap "$(printf 'rm -f -- %q' "$unsafe_files")" EXIT - # Heavy lifting - local filter_dirs=() + # Turn the variables up top into a bunch of `find(1)` filters + local filters=() local glob for glob in "${safe_dirs_glob[@]}"; do - filter_dirs+=(-type d -name "$glob" -prune -o) + filters+=(-type d -name "$glob" -prune -o) done - find . "${filter_dirs[@]}" -type f -printf '%s %p\n' | # find all files - while read -r size file; do # filter out files smaller than $min_size - [[ $size -lt $min_size ]] || printf '%s\n' "$file" - done | + filters+=(-type f -size +"${min_size}") + + # Heavy lifting + find . "${filters[@]}" -print | # find all files normalize_filenames | + grep -Fxvf <(printf '%s\n' "${safe_files_string[@]}") | + grep -Exvf <(printf '%s\n' "${safe_files_regexp[@]}") | xargs -r -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes sed -r "s@(.*)${sep}\s*(.*)@\2:\1@" | # reformat the output to be easier to parse - while IFS=: read -r type file; do - declare -A cached_types - if ! { matches_string "$file" "${safe_files_string[@]}" || \ - matches_regexp "$file" "${safe_files_regexp[@]}" ;}; then - if [[ -z ${cached_types[$type]} ]]; then - if matches_string "$type" "${safe_types_string[@]}" || \ - matches_regexp "$type" "${safe_types_regexp[@]}" ; then - cached_types[$type]=false - else - cached_types[$type]=true - fi - fi - if "${cached_types[$type]}"; then - printf "%s\n" "$file" - fi - fi - done > "$unsafe_files" + grep -Exvf <(printf '%s:.*\n' "${safe_types_regexp[@]}") | + cut -d: -f2- > "$unsafe_files" if [[ "$(stat -c '%s' -- "$unsafe_files")" -gt 0 ]]; then <"$unsafe_files" sort | print-$format -- cgit v1.2.2