summaryrefslogtreecommitdiff
path: root/jh-checksource.sh
blob: 9de91c0469763c8d0b5235898f0e08d6f625b4aa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
# Copyright © 2013-2014 Luke Shumaker <lukeshu@sbcglobal.net>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See the COPYING file for more details.

# Separator handed to `file -F`; chosen so that it never occurs in a filename.
sep='<no-filename-ever-contains-this>'

# MIME types that are considered harmless: regexps first, then exact strings.
safe_types_regexp=('^(inode|text|image|video|audio)/')
safe_types_string=('application/pdf' 'application/postscript' 'application/xml' 'application/ogg' 'message/rfc822')
# Directory names (find -name globs) that are never descended into.
safe_dirs_glob=(.{git,hg,svn} '*.git')
# Files that are exempt regardless of type: regexps first, then exact paths.
# The dot before "gmo" is escaped so that only a real ".gmo" suffix matches.
safe_files_regexp=('/po/[^/]*\.gmo$' '\.flw$' '\.odg$' '\.ppt$')
safe_files_string=()

# don't care about files less than 3 bytes.
min_size=3

# Canonical form of the working directory; normalized paths are made
# relative to it.
cwd="$(readlink -m -- "$PWD")"

# Canonicalize newline-separated paths read from stdin, one result per line.
#
# Each path is resolved with `readlink -m`, so it does not have to exist.
# Results located below $cwd are rewritten to start with "./"; every other
# result is printed as the absolute path readlink produced.
#
# Globals: cwd (read) - canonical path of the working directory
# Outputs: normalized paths on stdout
normalize_filenames() {
	local filename resolved
	while IFS='' read -r filename; do
		# On failure readlink prints its own diagnostic and no path;
		# emit nothing for that line and carry on.
		resolved=$(readlink -m -- "$filename") || continue
		# Compare and strip the prefix as a literal string. Passing $cwd
		# to sed as a regexp broke when the directory name contained
		# the "|" delimiter or regexp metacharacters.
		if [[ $resolved == "$cwd"/* ]]; then
			printf './%s\n' "${resolved#"$cwd"/}"
		else
			printf '%s\n' "$resolved"
		fi
	done
}

# Test whether a string is identical to any of a list of candidates.
#
# Arguments: $1  - the string to look for
#            $2… - candidate strings, compared literally
# Returns:   0 if some candidate equals $1, 1 otherwise (also when no
#            candidates are given)
matches_string() {
	local needle=$1
	shift
	# Keep the loop variable local so a caller's "straw" is not clobbered.
	local straw
	for straw in "$@"; do
		if [[ "$needle" = "$straw" ]]; then
			return 0
		fi
	done
	return 1
}

# Test whether a string matches any of a list of extended regular expressions.
#
# Arguments: $1  - the string to test
#            $2… - regexps, applied with bash's [[ =~ ]] operator
# Returns:   0 if some regexp matches $1, 1 otherwise (also when no
#            regexps are given)
matches_regexp() {
	local needle=$1
	shift
	# Keep the loop variable local so a caller's "straw" is not clobbered.
	local straw
	for straw in "$@"; do
		# $straw is deliberately unquoted: quoting would force a literal match.
		if [[ "$needle" =~ $straw ]]; then
			return 0
		fi
	done
	return 1
}

# Report for people: issue a warning naming the current directory via
# libremessages, then echo every line of stdin behind an arrow marker.
print-human() {
	local -r marker='  -> '
	libremessages warning "The source directory %s contains binary files:" "$PWD"
	sed -e "s/^/${marker}/"
}

# Report for programs: pass stdin through to stdout unchanged.
print-machine() {
	cat -
}

# Scan the tree below the current directory for files whose MIME type is
# not on the safe lists, and report them.
#
# Usage: main [-m] [SAFE_FILE...]
#   -m         emit the plain list (print-machine) instead of the
#              human-oriented report (print-human)
#   SAFE_FILE  path to exempt from the check; any argument other than -m
#              is treated as one
#
# Globals: safe_files_string (appended to), safe_files_regexp,
#          safe_types_string, safe_types_regexp, safe_dirs_glob, min_size,
#          sep (all read)
# Exit status: exits 1 after printing the sorted list when offending files
#              exist; exits 2 if the temporary file cannot be created;
#              returns 0 otherwise.
main() {
	local format=human
	# Parse arguments
	local arg
	for arg in "$@"; do
		case "$arg" in
			-m) format=machine;;
			*) safe_files_string+=("$(normalize_filenames <<<"$arg")");;
		esac
	done

	# Init
	# Declaration and assignment are split on purpose: `local v=$(cmd)`
	# discards cmd's status, and a failed mktemp used to end in a bogus
	# "nothing found" exit status of 0.
	local unsafe_files
	if ! unsafe_files="$(mktemp --tmpdir "${0##*/}.XXXXXXXXXX")"; then
		printf '%s: cannot create temporary file\n' "${0##*/}" >&2
		exit 2
	fi
	trap "$(printf 'rm -f -- %q' "$unsafe_files")" EXIT

	# Heavy lifting
	local filter_dirs=()
	local glob
	for glob in "${safe_dirs_glob[@]}"; do
		filter_dirs+=(-type d -name "$glob" -prune -o)
	done
	# Let find drop files smaller than $min_size bytes ("+Nc" means more
	# than N bytes). Doing this in a shell `read` loop stripped trailing
	# whitespace from names and arithmetically evaluated text that could
	# originate from a file name.
	local size_filter=()
	if (( min_size > 0 )); then
		size_filter=(-size "+$(( min_size - 1 ))c")
	fi
	# Verdict per MIME type ("true" = unsafe); the pipeline's subshell
	# inherits this array, so it only needs declaring once.
	local -A cached_types=()
	local type file
	find . "${filter_dirs[@]}" -type f "${size_filter[@]}" -printf '%p\n' | # find all files
	normalize_filenames |
	xargs -r -d'\n' file --mime-type -r -F "$sep" | # identify the filetypes
	sed -r "s@(.*)${sep}\s*(.*)@\2:\1@" |  # reformat the output to be easier to parse
	while IFS=: read -r type file; do
		if ! { matches_string "$file" "${safe_files_string[@]}" || \
		       matches_regexp "$file" "${safe_files_regexp[@]}" ;}; then
			if [[ -z ${cached_types[$type]} ]]; then
				if matches_string "$type" "${safe_types_string[@]}" || \
				   matches_regexp "$type" "${safe_types_regexp[@]}" ; then
					cached_types[$type]=false
				else
					cached_types[$type]=true
				fi
			fi
			# The cached value is the name of a builtin (true/false).
			if "${cached_types[$type]}"; then
				printf "%s\n" "$file"
			fi
		fi
	done > "$unsafe_files"

	# -s: the file exists and is non-empty, i.e. something was flagged.
	if [[ -s "$unsafe_files" ]]; then
		<"$unsafe_files" sort | "print-$format"
		exit 1
	fi
}

# Run the check, forwarding the script's command-line arguments unchanged.
main "$@"