From 0f501979af323d087561acf4944dd1ddedaa0d05 Mon Sep 17 00:00:00 2001 From: bill-auger Date: Wed, 11 Mar 2020 02:23:52 -0400 Subject: [report] refactor report script for per-file operation and reports --- blacklist-testdata.txt | 14 -- report | 337 ++++++++++++++++++++++++++++--------------------- 2 files changed, 195 insertions(+), 156 deletions(-) diff --git a/blacklist-testdata.txt b/blacklist-testdata.txt index 232d502..93f9d38 100644 --- a/blacklist-testdata.txt +++ b/blacklist-testdata.txt @@ -1,19 +1,5 @@ # comments begin with '#' -##------ expected results ------## -# entries found: 20 # -# entries valid: 14 # -# entries_invalid: 6 # -# entries lacking tags: 2 # -# entries lacking description: 3 # -# unspecified tags: 3 # -# neglected entries: 4 # -# duplicate_names: 2 # -# identical entries: 1 # -# differing entries: 1 # -##------------------------------## - - valid-complete:replacement:parabola:42:[branding]valid complete valid-no-desc::::[branding] valid-no-tags::::valid no tags diff --git a/report b/report index 1685317..17a32ee 100755 --- a/report +++ b/report @@ -1,201 +1,254 @@ #!/usr/bin/env ruby =begin - post_fsd_wiki.phantomjs + report.rb this script is used to validate and report statistics on the blacklist entries - it currently detects syntax errors, missing tags, unspecified tags, missing descriptions, - duplicate entries for a single package (partitioning them as identical or differing), - and 'neglected_entries' which are those with no tag, no description, and no replacement + it currently detects syntax errors, missing tags, unknown tags, missing descriptions, + duplicate entries for a single package (partitioning them as identical or differing), + and will fail if any of those conditions are unmet + it also detects entries with no replacement, although that is not an error it optionally creates a JSON file with the results that can be the input to the post_fsd_wiki.phantomjs script =end -# DEBUG begin +## DEBUG begin ## DEBUG = false require 'byebug' if DEBUG -DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ] -def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ; -# DEBUG end +DEBUG_FILTER_NAMES = [] +def IS_DEBUG_FILTER_NAME name ; DEBUG && (DEBUG_FILTER_NAMES.include? name) ; end ; + +def DBG_PARSE input_filename ; if DEBUG ; print "\nDEBUG: parsing #{input_filename}\n" ; end ; end ; +def DBG_FILTER_NAME line ; if DEBUG ; DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea } ; end ; end ; +def DBG_TOKENS tokens ; if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ; end ; +def DBG_TAG entry ; if DEBUG ; print "\nparsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ; end ; +def DBG_DESC entry ; if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ; end ; +def DBG_NO_TAG entry ; if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ; end ; +def DBG_NO_DESC entry ; if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ; end ; +def DBG_ENTRY entry ; if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ; end ; +def DBG_EXPECTED input_filename ; if input_filename == 'blacklist-testdata.txt' + print "##------ expected results ------##\n" + + "# entries found: 20 #\n" + + "# entries valid: 14 #\n" + + "# entries invalid: 6 #\n" + + "# entries lacking tags: 2 #\n" + + "# entries lacking description: 3 #\n" + + "# tags unknown: 3 #\n" + + "# entries unreplaced: 13 #\n" + + "# entries duplicated: 2 #\n" + + "# identical: 1 #\n" + + "# differing: 1 #\n" + + "##------------------------------##\n" ; end ; end ; +## DEBUG end ## require 'json' require 'set' -# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description -BLACKLIST_FILES = [ 'blacklist.txt' ] -# BLACKLIST_FILES = [ 'blacklist-testdata.txt' ] -VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/ -BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/ -RAW_ENTRY_KEY = :raw_blacklist_entry -PACKAGE_NAME_KEY = :original_package # per blacklist SYNTAX -REPLACEMENT_KEY = :libre_replacement # per blacklist SYNTAX -REFERENCE_KEY = :ref # per blacklist SYNTAX -ENTRY_ID_KEY = :id # per blacklist SYNTAX -DESCRIPTION_KEY = :short_description # per blacklist SYNTAX -BLACKLIST_TAGS_KEY = :blacklist_tags -NONFREE_TAG = 'nonfree' -SEMIFREE_TAG = 'semifree' -USES_NONFREE_TAG = 'uses-nonfree' -BRANDING_TAG = 'branding' -TECHNICAL_TAG = 'technical' -HAS_REPLACEMENT_TAG = 'FIXME:package' -NEEDS_DESC_TAG = 'FIXME:description' -ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG , - TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ] -DO_PRINT_STATS = true -DO_PRINT_INCOMPLETE = true -DO_PRINT_DUPLICATE = false -REPORT_SEPARATOR = "------------------------------------------------------------\n" -OUTPUT_JSON_FILE = 'blacklist-data.json' - -entries_invalid = [] -entries = [] -entry_freqs = {} -entries_no_desc = [] -entries_no_tags = [] -entries_unspecified_tags = [] -unspecified_tags = Set[] -duplicate_names = nil # deferred -duplicate_identical_entries = {} -duplicate_differing_entries = {} - - -## parse data ## - -BLACKLIST_FILES.each do | blacklist_filename | -if DEBUG ; print "\nDEBUG: parsing #{blacklist_filename}\n" ; end ; - - (File.readlines blacklist_filename).each do | line | -# DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea } - - next if line.strip.empty? || (line.strip.start_with? '#') - entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX - - entries << (entry = {}) - tokens = (line.split ':') - -if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ; +# NOTE: acceptable entry syntax per SYNTAX doc => +# ORIGINAL_PACKAGE:LIBRE_REPLACEMENT:REF:REF_ID:[TAG] SHORT_DESCRIPTION +ALL_BLACKLIST_FILES = [ 'blacklist.txt' , 'your-freedom_emu-blacklist.txt' , 'your-privacy-blacklist.txt' ] +BLACKLIST_FILES = (ALL_BLACKLIST_FILES.include? ARGV.first) ? [ ARGV.first ] : ALL_BLACKLIST_FILES +# BLACKLIST_FILES = [ 'blacklist-testdata.txt' ] # DEBUG +VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/ +BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/ +RAW_ENTRY_KEY = :raw_blacklist_entry +PACKAGE_NAME_KEY = :original_package # syntax token +REPLACEMENT_KEY = :libre_replacement # syntax token +REFERENCE_KEY = :ref # syntax token +REFERENCE_ID_KEY = :id # syntax token +DESCRIPTION_KEY = :short_description # syntax token +BLACKLIST_TAGS_KEY = :blacklist_tags +NONFREE_TAG = 'nonfree' +SEMIFREE_TAG = 'semifree' +USES_NONFREE_TAG = 'uses-nonfree' +BRANDING_TAG = 'branding' +TECHNICAL_TAG = 'technical' +NEEDS_REPLACEMENT_TAG = 'FIXME:package' +NEEDS_DESC_TAG = 'FIXME:description' +ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG , + TECHNICAL_TAG , NEEDS_REPLACEMENT_TAG , NEEDS_DESC_TAG ] +DO_PRINT_STATS = true +DO_PRINT_INVALID = true +DO_PRINT_UNREPLACED = true +DO_PRINT_INCOMPLETE = true +DO_PRINT_DUPLICATED = true +REPORT_SEPARATOR = "------------------------------------------------------------\n" + + +def reset_state + @entries_invalid = [] + @entries_valid = [] + @entry_freqs = {} + @entries_no_desc = [] + @entries_no_tags = [] + @entries_tags_unknown = [] + @tags_unknown = Set[] + @entries_unreplaced = nil # deferred + @entries_duplicated = nil # deferred + @entries_duplicated_identical = {} + @entries_duplicated_differing = {} + @errors = [] +end + +def parse_entries input_filename +DBG_PARSE input_filename + + (File.readlines input_filename).each do | line | + +DBG_FILTER_NAME line + + next if line.strip.empty? || (line.strip.start_with? '#') + @entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX + + @entries_valid << (entry = {}) + tokens = (line.split ':') entry[RAW_ENTRY_KEY ] = line entry[PACKAGE_NAME_KEY ] = (tokens.shift ).gsub("\t" , '').strip entry[REPLACEMENT_KEY ] = (tokens.shift ).gsub("\t" , '').strip entry[REFERENCE_KEY ] = (tokens.shift ).gsub("\t" , '').strip - entry[ENTRY_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip + entry[REFERENCE_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip entry[DESCRIPTION_KEY ] = (tokens.join ':').gsub("\t" , '').strip entry[BLACKLIST_TAGS_KEY] = [] +DBG_TOKENS tokens + # parse tags while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']') -if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ; -# debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY]) + +DBG_TAG entry entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1') entry[DESCRIPTION_KEY ] = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2') -if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ; +DBG_DESC entry + end -if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ; -if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ; +DBG_NO_TAG entry +DBG_NO_DESC entry end end +def process_entries + @entries_valid.each do | entry | -## process data ## +DBG_ENTRY entry -entries.each do | entry | -if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ; + entry_name = entry[PACKAGE_NAME_KEY ] + entry_desc = entry[DESCRIPTION_KEY ] + entry_tags = entry[BLACKLIST_TAGS_KEY] + entry_tags_unknown = entry_tags - ACCEPTABLE_TAGS - entry_name = entry[PACKAGE_NAME_KEY ] - entry_desc = entry[DESCRIPTION_KEY ] - entry_tags = entry[BLACKLIST_TAGS_KEY] - entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS - - entry_freqs[entry_name] = (entry_freqs[entry_name] ||= 0) + 1 - entries_no_desc << entry if entry_desc .empty? - entries_no_tags << entry if entry_tags .empty? - entries_unspecified_tags << entry unless entry_unspecified_tags.empty? - unspecified_tags.merge entry_unspecified_tags -end - -duplicate_names = entry_freqs.keys.select { | name | entry_freqs[name] > 1 } -incomplete_entries = entries_no_desc + entries_no_tags -unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? } -neglected_entries = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set & - unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set + @entry_freqs[entry_name] = (@entry_freqs[entry_name] ||= 0) + 1 + @entries_no_desc << entry if entry_desc .empty? + @entries_no_tags << entry if entry_tags .empty? + @entries_tags_unknown << entry unless entry_tags_unknown.empty? + @tags_unknown.merge entry_tags_unknown + end -duplicate_names.each do | duplicate_name | -# next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG + @entries_unreplaced = @entries_valid .select { | entry | entry[REPLACEMENT_KEY].empty? } + @entries_duplicated = @entry_freqs.keys.select { | name | @entry_freqs[name] > 1 } - duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \ - .map! { | entry | entry[RAW_ENTRY_KEY ] } - unique_entries = duplicate_entries.uniq - n_unique_entries = unique_entries.size + @entries_duplicated.each do | duplicate_name | + duplicate_entries = @entries_valid.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \ + .map! { | entry | entry[RAW_ENTRY_KEY ] } + unique_entries = duplicate_entries.uniq + n_unique_entries = unique_entries.size - unique_entries.each do | uniq_value | - n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value } - duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1 - end - if n_unique_entries > 1 - duplicate_differing_entries[duplicate_name] = unique_entries + unique_entries.each do | uniq_value | + n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value } + @entries_duplicated_identical[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1 + end + if n_unique_entries > 1 + @entries_duplicated_differing[duplicate_name] = unique_entries + end end end +def print_report input_filename + if DO_PRINT_INVALID || DO_PRINT_INCOMPLETE || DO_PRINT_UNREPLACED + print "\n\n#{REPORT_SEPARATOR}#{input_filename} report:\n" + end -## report ## - -print REPORT_SEPARATOR -print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS -print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS -print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS -print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS -print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS -print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS -print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS -print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS -print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS -print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS - -if DO_PRINT_INCOMPLETE - { 'invalid entries' => entries_invalid , - 'entries lacking description' => entries_no_desc , - 'entries lacking tags' => entries_no_tags , - 'entries with unspecified tags' => entries_unspecified_tags , - 'unspecified tags' => unspecified_tags }.each_pair do | label , data | - print REPORT_SEPARATOR + "#{label}:\n" unless data.empty? + print_invalid = {} + print_invalid['entries invalid' ] = @entries_invalid if DO_PRINT_INVALID + print_invalid['entries lacking description'] = @entries_no_desc if DO_PRINT_INCOMPLETE + print_invalid['entries lacking tags' ] = @entries_no_tags if DO_PRINT_INCOMPLETE + print_invalid['entries with unknown tags' ] = @entries_tags_unknown if DO_PRINT_INCOMPLETE + print_invalid['tags unknown' ] = @tags_unknown if DO_PRINT_INCOMPLETE + print_invalid.each_pair do | label , data | + print "#{REPORT_SEPARATOR}#{label}:\n" unless data.empty? data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" } end -end -unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE - print REPORT_SEPARATOR + "neglected entries:\n" - neglected_entries.each { | entry_name | print " #{entry_name}\n" } -end + unless @entries_unreplaced.empty? || ! DO_PRINT_UNREPLACED + print "#{REPORT_SEPARATOR}entries unreplaced:\n" + @entries_unreplaced.each { | entry | print " #{entry[PACKAGE_NAME_KEY]}\n" } + end + + unless @entries_duplicated.empty? || ! DO_PRINT_DUPLICATED + print "#{REPORT_SEPARATOR}entries duplicates:\n" + @entries_duplicated.each do | duplicate_name | + entry_identical = @entries_duplicated_identical[duplicate_name] + entries_differing = @entries_duplicated_differing[duplicate_name] + + print "\n #{duplicate_name}:\n" + print " identical:\n" unless entry_identical .nil? + print " #{entry_identical}\n" unless entry_identical .nil? + print " differing:\n" unless entries_differing.nil? + entries_differing.each { | entry | print " #{entry}\n" } unless entries_differing.nil? + end + end -unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE - print REPORT_SEPARATOR + "duplicate entries:\n" - duplicate_names.each do | duplicate_name | - identical_entry = duplicate_identical_entries[duplicate_name] - differing_entries = duplicate_differing_entries[duplicate_name] - - print "\n #{duplicate_name}:\n" - print " identical entries:\n" unless identical_entry .nil? - print " #{identical_entry}\n" unless identical_entry .nil? - print " differing entries:\n" unless differing_entries.nil? - differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil? + if DO_PRINT_STATS + print "#{REPORT_SEPARATOR}#{input_filename} stats:\n" + print " entries found: #{ (@entries_valid + @entries_invalid).size }\n" + print " entries valid: #{ (@entries_valid ).size }\n" + print " entries invalid: #{ (@entries_invalid ).size }\n" + print " entries lacking tags: #{ (@entries_no_tags ).size }\n" + print " entries lacking description: #{(@entries_no_desc ).size }\n" + print " tags unknown: #{ (@tags_unknown ).size }\n" + print " entries unreplaced: #{ (@entries_unreplaced ).size }\n" + print " entries duplicated: #{ (@entries_duplicated ).size }\n" + print " identical: #{ (@entries_duplicated_identical ).size }\n" + print " differing: #{ (@entries_duplicated_differing ).keys.size}\n" + print REPORT_SEPARATOR end + +DBG_EXPECTED input_filename end +def sanity_check input_filename + @errors << 'entries_invalid' unless @entries_invalid .empty? + # @errors << 'entries_no_tags' unless @entries_no_tags .empty? # TODO: complete these entries + # @errors << 'entries_no_desc' unless @entries_no_desc .empty? # TODO: complete these entries + @errors << 'tags_unknown' unless @tags_unknown .empty? + @errors << 'entries_duplicated' unless @entries_duplicated.empty? +end -## sanity check ## +def generate_json input_filename + output_json_file = "#{input_filename}.json" -should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?) -(print "errors were found - JSON will not be generated\n" ; exit 1) if should_quit + if @errors.empty? + IO.write output_json_file , @entries_valid.to_json + print "\nwrote: #{output_json_file}\n\nno problems detected in #{input_filename}\n" + else + print "\nERROR: in #{input_filename} - #{@errors.join ','} - JSON will not be generated\n" + end +end -## generate JSON ## +BLACKLIST_FILES.each do | input_filename | + reset_state + parse_entries input_filename + process_entries + print_report input_filename + sanity_check input_filename + generate_json input_filename -IO.write OUTPUT_JSON_FILE , entries.to_json ; print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n" ; + exit 1 unless @errors.empty? +end -- cgit v1.2.2