#!/usr/bin/env ruby

=begin
  report.rb

  this script is used to validate and report statistics on the blacklist entries

  it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
  duplicate entries for a single package (partitioning them as identical or differing),
  and 'neglected_entries' which are those with no tag, no description, and no replacement

  it optionally creates a JSON file with the results
  that can be the input to the post_fsd_wiki.phantomjs script
=end


# DEBUG begin
DEBUG = false
require 'byebug' if DEBUG
DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ]
# true if 'name' is one of the packages selected for verbose debug tracing
def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ;
# DEBUG end


require 'json'
require 'set'

# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
BLACKLIST_FILES     = [ 'blacklist.txt' ]
# BLACKLIST_FILES   = [ 'blacklist-testdata.txt' ]
# a valid entry has exactly 4 top-level colons; 'ref' is empty or one of the known trackers
VALID_ENTRIES_REGEX  = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
# captures a leading "[tag]" prefix (\1) and the remainder of the description (\2)
BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/
RAW_ENTRY_KEY       = :raw_blacklist_entry
PACKAGE_NAME_KEY    = :original_package  # per blacklist SYNTAX
REPLACEMENT_KEY     = :libre_replacement # per blacklist SYNTAX
REFERENCE_KEY       = :ref               # per blacklist SYNTAX
ENTRY_ID_KEY        = :id                # per blacklist SYNTAX
DESCRIPTION_KEY     = :short_description # per blacklist SYNTAX
BLACKLIST_TAGS_KEY  = :blacklist_tags
NONFREE_TAG         = 'nonfree'
SEMIFREE_TAG        = 'semifree'
USES_NONFREE_TAG    = 'uses-nonfree'
BRANDING_TAG        = 'branding'
TECHNICAL_TAG       = 'technical'
HAS_REPLACEMENT_TAG = 'FIXME:package'
NEEDS_DESC_TAG      = 'FIXME:description'
ACCEPTABLE_TAGS     = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG ,
                        TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ]
DO_PRINT_STATS      = true
DO_PRINT_INCOMPLETE = true
DO_PRINT_DUPLICATE  = false
REPORT_SEPARATOR    = "------------------------------------------------------------\n"
OUTPUT_JSON_FILE    = 'blacklist-data.json'

entries_invalid             = []    # raw lines that failed VALID_ENTRIES_REGEX
entries                     = []    # parsed entry hashes
entry_freqs                 = {}    # package-name => occurrence count
entries_no_desc             = []
entries_no_tags             = []
entries_unspecified_tags    = []
unspecified_tags            = Set[] # tags seen that are not in ACCEPTABLE_TAGS
duplicate_names             = nil   # deferred
duplicate_identical_entries = {}    # name => representative raw line " (N identical)"
duplicate_differing_entries = {}    # name => array of distinct raw lines


## parse data ##

BLACKLIST_FILES.each do | blacklist_filename |
  print "\nDEBUG: parsing #{blacklist_filename}\n" if DEBUG

  (File.readlines blacklist_filename).each do | line |
#   DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea } # DEBUG

    # skip blank lines and full-line comments
    next if line.strip.empty? || (line.strip.start_with? '#')

    # collect syntactically-invalid lines separately; they are reported later
    # NOTE: was `entries_invalid << line && next unless …` — same behavior,
    #       made explicit to avoid relying on `<<` / `&&` precedence
    unless line.match VALID_ENTRIES_REGEX
      entries_invalid << line
      next
    end

    entries << (entry = {})
    tokens = (line.split ':')

if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ;

    # the first four fields are fixed; any further colons belong to the description
    entry[RAW_ENTRY_KEY     ] = line
    entry[PACKAGE_NAME_KEY  ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[REPLACEMENT_KEY   ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[REFERENCE_KEY     ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[ENTRY_ID_KEY      ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[DESCRIPTION_KEY   ] = (tokens.join ':').gsub("\t" , '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # peel leading "[tag]" prefixes off of the description, one per iteration
    while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
#     debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
      entry[DESCRIPTION_KEY   ] =  (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')

if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
    end
if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY  ].empty? ; end ;
  end
end


## process data ##

entries.each do | entry |
if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;

  entry_name             = entry[PACKAGE_NAME_KEY  ]
  entry_desc             = entry[DESCRIPTION_KEY   ]
  entry_tags             = entry[BLACKLIST_TAGS_KEY]
  entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS

  entry_freqs[entry_name] = (entry_freqs[entry_name] || 0) + 1
  entries_no_desc          << entry if     entry_desc            .empty?
  entries_no_tags          << entry if     entry_tags            .empty?
  entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  unspecified_tags.merge entry_unspecified_tags
end

duplicate_names    = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
# neglected == incomplete (no tag or no description) AND no replacement suggested
neglected_entries  = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
                     unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set

duplicate_names.each do | duplicate_name |
# next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG

  duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
                             .map!   { | entry | entry[RAW_ENTRY_KEY   ] }
  unique_entries    = duplicate_entries.uniq
  n_unique_entries  = unique_entries.size

  # NOTE(review): if several distinct raw lines each repeat verbatim, only the
  #               last one is remembered here (hash slot is overwritten per name)
  unique_entries.each do | uniq_value |
    n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
    duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
  end
  if n_unique_entries > 1
    duplicate_differing_entries[duplicate_name] = unique_entries
  end
end


## report ##

print REPORT_SEPARATOR
print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS

if DO_PRINT_INCOMPLETE
  { 'invalid entries'               => entries_invalid          ,
    'entries lacking description'   => entries_no_desc          ,
    'entries lacking tags'          => entries_no_tags          ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags'              => unspecified_tags         }.each_pair do | label , data |
    print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
    # entry hashes print their raw blacklist line; bare tag strings print as-is
    data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  end
end

unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { | entry_name | print " #{entry_name}\n" }
end

unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do | duplicate_name |
    identical_entry   = duplicate_identical_entries[duplicate_name]
    differing_entries = duplicate_differing_entries[duplicate_name]

    print "\n #{duplicate_name}:\n"
    print " identical entries:\n" unless identical_entry .nil?
    print " #{identical_entry}\n" unless identical_entry .nil?
    print " differing entries:\n" unless differing_entries.nil?
    differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
  end
end


## sanity check ##

# refuse to emit JSON unless the blacklist is fully clean
should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
if should_quit
  print "errors were found - JSON will not be generated\n"
  exit 1
end


## generate JSON ##

IO.write OUTPUT_JSON_FILE , entries.to_json
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n"