author    bill-auger <mr.j.spam.me@gmail.com>  2018-01-14 04:26:11 -0500
committer bill-auger <mr.j.spam.me@gmail.com>  2018-02-13 13:29:34 -0500
commit    eca20f9c8263c51dfc1f1a3527daf9db10c09fc1 (patch)
tree      8b3d2de240d0e9f53492630b110317acb5b99033 /report.rb
parent    04fb3a23ed8373a511053624a679c0564d7e99fd (diff)
add report script
Diffstat (limited to 'report.rb')
-rwxr-xr-x  report.rb  201
1 file changed, 201 insertions(+), 0 deletions(-)
diff --git a/report.rb b/report.rb
new file mode 100755
index 0000000..4c3ff74
--- /dev/null
+++ b/report.rb
@@ -0,0 +1,201 @@
+#!/usr/bin/env ruby
+
+=begin
+ report.rb
+
+ this script validates the blacklist entries and reports statistics on them
+
+ it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
+ duplicate entries for a single package (partitioning them as identical or differing),
+ and 'neglected_entries', i.e. those lacking a replacement and lacking tags or a description
+
+ it optionally creates a JSON file with the results
+ that can be used as input to the post_fsd_wiki.phantomjs script
+=end
+
+
+# DEBUG begin
+DEBUG = false
+require 'byebug' if DEBUG
+DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ]
+def debug_filter_name? name ; DEBUG_FILTER_NAMES.include? name ; end
+# DEBUG end
+
+
+require 'json'
+require 'set'
+
+# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
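+#   e.g. a hypothetical entry (not taken from the real blacklist):
+#     skype::sv::[uses-nonfree] relies on a nonfree network service
+#   (libre-replacement and id are empty ; 'sv' is the ref ; '[tags]' prefix the description)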
+BLACKLIST_FILES = [ 'blacklist.txt' ]
+# BLACKLIST_FILES = [ 'blacklist-testdata.txt' ]
+VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
+BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/
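+# a valid entry has five ':'-delimited fields (the description itself may contain further ':') ;
+# ref must be empty or one of: sv debian parabola fsf fedora
+# BLACKLIST_TAGS_REGEX captures one leading '[tag]' and the rest of the description,
+#   e.g. '[nonfree] blobs' => tag 'nonfree' , description 'blobs'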
+RAW_ENTRY_KEY = :raw_blacklist_entry
+PACKAGE_NAME_KEY = :original_package # per blacklist SYNTAX
+REPLACEMENT_KEY = :libre_replacement # per blacklist SYNTAX
+REFERENCE_KEY = :ref # per blacklist SYNTAX
+ENTRY_ID_KEY = :id # per blacklist SYNTAX
+DESCRIPTION_KEY = :short_description # per blacklist SYNTAX
+BLACKLIST_TAGS_KEY = :blacklist_tags
+NONFREE_TAG = 'nonfree'
+SEMIFREE_TAG = 'semifree'
+USES_NONFREE_TAG = 'uses-nonfree'
+BRANDING_TAG = 'branding'
+TECHNICAL_TAG = 'technical'
+HAS_REPLACEMENT_TAG = 'FIXME:package'
+NEEDS_DESC_TAG = 'FIXME:description'
+ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG ,
+ TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ]
+DO_PRINT_STATS = true
+DO_PRINT_INCOMPLETE = true
+DO_PRINT_DUPLICATE = false
+REPORT_SEPARATOR = "------------------------------------------------------------\n"
+OUTPUT_JSON_FILE = 'blacklist-data.json'
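+# the DO_PRINT_* flags above toggle the corresponding sections of the report below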
+
+entries_invalid = []
+entries = []
+entry_freqs = {}
+entries_no_desc = []
+entries_no_tags = []
+entries_unspecified_tags = []
+unspecified_tags = Set[]
+duplicate_names = nil # deferred
+duplicate_identical_entries = {}
+duplicate_differing_entries = {}
+
+
+## parse data ##
+
+BLACKLIST_FILES.each do | blacklist_filename |
+if DEBUG ; print "\nDEBUG: parsing #{blacklist_filename}\n" ; end ;
+
+ (File.readlines blacklist_filename).each do | line |
+# DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }
+
+ next if line.strip.empty? || (line.strip.start_with? '#')
+ entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX
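+ # ('<<' binds tighter than '&&' : the invalid line is appended, then 'next' skips it)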
+
+ entries << (entry = {})
+ tokens = (line.split ':')
+
+if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if debug_filter_name? tokens[0] } ; end ;
+
+ entry[RAW_ENTRY_KEY ] = line
+ entry[PACKAGE_NAME_KEY ] = (tokens.shift ).gsub("\t" , '').strip
+ entry[REPLACEMENT_KEY ] = (tokens.shift ).gsub("\t" , '').strip
+ entry[REFERENCE_KEY ] = (tokens.shift ).gsub("\t" , '').strip
+ entry[ENTRY_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip
+ entry[DESCRIPTION_KEY ] = (tokens.join ':').gsub("\t" , '').strip
+ entry[BLACKLIST_TAGS_KEY] = []
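+ # NOTE: re-joining the remaining tokens with ':' preserves any colons inside the description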
+
+ # parse tags
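+ # each pass strips one leading '[tag]' from the description, e.g.
+ #   '[nonfree] [FIXME:description] text' => tags ['nonfree' , 'FIXME:description'] , desc 'text'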
+ while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
+if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
+# debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])
+
+ entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
+ entry[DESCRIPTION_KEY ] = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')
+
+if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
+ end
+if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
+if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ;
+ end
+end
+
+
+## process data ##
+
+entries.each do | entry |
+if DEBUG && (debug_filter_name? entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;
+
+ entry_name = entry[PACKAGE_NAME_KEY ]
+ entry_desc = entry[DESCRIPTION_KEY ]
+ entry_tags = entry[BLACKLIST_TAGS_KEY]
+ entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS
+
+ entry_freqs[entry_name] = (entry_freqs[entry_name] || 0) + 1
+ entries_no_desc << entry if entry_desc .empty?
+ entries_no_tags << entry if entry_tags .empty?
+ entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
+ unspecified_tags.merge entry_unspecified_tags
+end
+
+duplicate_names = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
+incomplete_entries = entries_no_desc + entries_no_tags
+unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
+neglected_entries = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
+ unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set
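+# (set intersection: package names that are both incomplete and unreplaced)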
+
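+# partition each duplicated name's raw entries: lines repeated verbatim are 'identical' ,
+# multiple distinct lines for the same name are 'differing'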
+duplicate_names.each do | duplicate_name |
+# next unless debug_filter_name? duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG
+
+ duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
+ .map! { | entry | entry[RAW_ENTRY_KEY ] }
+ unique_entries = duplicate_entries.uniq
+ n_unique_entries = unique_entries.size
+
+ unique_entries.each do | uniq_value |
+ n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
+ duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
+ end
+ if n_unique_entries > 1
+ duplicate_differing_entries[duplicate_name] = unique_entries
+ end
+end
+
+
+## report ##
+
+print REPORT_SEPARATOR
+print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
+print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
+print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
+print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
+print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
+print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
+print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
+print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
+print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
+print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS
+
+if DO_PRINT_INCOMPLETE
+ { 'invalid entries' => entries_invalid ,
+ 'entries lacking description' => entries_no_desc ,
+ 'entries lacking tags' => entries_no_tags ,
+ 'entries with unspecified tags' => entries_unspecified_tags ,
+ 'unspecified tags' => unspecified_tags }.each_pair do | label , data |
+ print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
+ data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
+ end
+end
+
+unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
+ print REPORT_SEPARATOR + "neglected entries:\n"
+ neglected_entries.each { | entry_name | print " #{entry_name}\n" }
+end
+
+unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
+ print REPORT_SEPARATOR + "duplicate entries:\n"
+ duplicate_names.each do | duplicate_name |
+ identical_entry = duplicate_identical_entries[duplicate_name]
+ differing_entries = duplicate_differing_entries[duplicate_name]
+
+ print "\n #{duplicate_name}:\n"
+ print " identical entries:\n" unless identical_entry .nil?
+ print " #{identical_entry}\n" unless identical_entry .nil?
+ print " differing entries:\n" unless differing_entries.nil?
+ differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
+ end
+end
+
+
+## sanity check ##
+
+should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
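+# a nonzero exit status lets a caller (e.g. a shell script) detect validation failures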
+(print "errors were found - JSON will not be generated\n" ; exit 1) if should_quit
+
+
+## generate JSON ##
+
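+# entries serialize as an array of objects with string keys (illustrative shape of one element):
+#   { "raw_blacklist_entry" : "..." , "original_package" : "..." , "libre_replacement" : "..." ,
+#     "ref" : "..." , "id" : "..." , "short_description" : "..." , "blacklist_tags" : [ "..." ] }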
+IO.write OUTPUT_JSON_FILE , entries.to_json ; print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n" ;