#!/usr/bin/env ruby

=begin
  report.rb

  this script is used to validate and report statistics on the blacklist entries

  it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
  duplicate entries for a single package (partitioning them as identical or differing),
  and 'neglected_entries' which are those with no tag, no description, and no replacement

  it optionally creates a JSON file with the results
  that can be the input to the post_fsd_wiki.phantomjs script
=end


# DEBUG begin
DEBUG = false
require 'byebug' if DEBUG
DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ]
# true if 'name' is one of the packages selected for verbose debug tracing
def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ;
# DEBUG end


require 'json'
require 'set'

# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
BLACKLIST_FILES     = [ 'blacklist.txt' ]
# BLACKLIST_FILES   = [ 'blacklist-testdata.txt' ]
# a valid entry has exactly 4 top-level colons; 'ref' is empty or one of the known trackers
VALID_ENTRIES_REGEX  = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
# captures a leading "[tag]" prefix (\1) and the remainder of the description (\2)
BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/
RAW_ENTRY_KEY       = :raw_blacklist_entry
PACKAGE_NAME_KEY    = :original_package  # per blacklist SYNTAX
REPLACEMENT_KEY     = :libre_replacement # per blacklist SYNTAX
REFERENCE_KEY       = :ref               # per blacklist SYNTAX
ENTRY_ID_KEY        = :id                # per blacklist SYNTAX
DESCRIPTION_KEY     = :short_description # per blacklist SYNTAX
BLACKLIST_TAGS_KEY  = :blacklist_tags
NONFREE_TAG         = 'nonfree'
SEMIFREE_TAG        = 'semifree'
USES_NONFREE_TAG    = 'uses-nonfree'
BRANDING_TAG        = 'branding'
TECHNICAL_TAG       = 'technical'
HAS_REPLACEMENT_TAG = 'FIXME:package'
NEEDS_DESC_TAG      = 'FIXME:description'
ACCEPTABLE_TAGS     = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG ,
                        TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ]
DO_PRINT_STATS      = true
DO_PRINT_INCOMPLETE = true
DO_PRINT_DUPLICATE  = false
REPORT_SEPARATOR    = "------------------------------------------------------------\n"
OUTPUT_JSON_FILE    = 'blacklist-data.json'

entries_invalid             = []    # raw lines that failed VALID_ENTRIES_REGEX
entries                     = []    # parsed entry hashes
entry_freqs                 = {}    # package-name => occurrence count
entries_no_desc             = []
entries_no_tags             = []
entries_unspecified_tags    = []
unspecified_tags            = Set[] # tags seen that are not in ACCEPTABLE_TAGS
duplicate_names             = nil   # deferred
duplicate_identical_entries = {}    # name => representative raw line " (N identical)"
duplicate_differing_entries = {}    # name => array of distinct raw lines


## parse data ##

BLACKLIST_FILES.each do | blacklist_filename |
  print "\nDEBUG: parsing #{blacklist_filename}\n" if DEBUG

  (File.readlines blacklist_filename).each do | line |
#   DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea } # DEBUG

    # skip blank lines and full-line comments
    next if line.strip.empty? || (line.strip.start_with? '#')

    # collect syntactically-invalid lines separately; they are reported later
    # NOTE: was `entries_invalid << line && next unless …` — same behavior,
    #       made explicit to avoid relying on `<<` / `&&` precedence
    unless line.match VALID_ENTRIES_REGEX
      entries_invalid << line
      next
    end

    entries << (entry = {})
    tokens = (line.split ':')

if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ;

    # the first four fields are fixed; any further colons belong to the description
    entry[RAW_ENTRY_KEY     ] = line
    entry[PACKAGE_NAME_KEY  ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[REPLACEMENT_KEY   ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[REFERENCE_KEY     ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[ENTRY_ID_KEY      ] = (tokens.shift   ).gsub("\t" , '').strip
    entry[DESCRIPTION_KEY   ] = (tokens.join ':').gsub("\t" , '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # peel leading "[tag]" prefixes off of the description, one per iteration
    while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
#     debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
      entry[DESCRIPTION_KEY   ] =  (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')

if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
    end
if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY  ].empty? ; end ;
  end
end


## process data ##

entries.each do | entry |
if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;

  entry_name             = entry[PACKAGE_NAME_KEY  ]
  entry_desc             = entry[DESCRIPTION_KEY   ]
  entry_tags             = entry[BLACKLIST_TAGS_KEY]
  entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS

  entry_freqs[entry_name] = (entry_freqs[entry_name] || 0) + 1
  entries_no_desc          << entry if     entry_desc            .empty?
  entries_no_tags          << entry if     entry_tags            .empty?
  entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  unspecified_tags.merge entry_unspecified_tags
end

duplicate_names    = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
# neglected == incomplete (no tag or no description) AND no replacement suggested
neglected_entries  = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
                     unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set

duplicate_names.each do | duplicate_name |
# next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG

  duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
                             .map!   { | entry | entry[RAW_ENTRY_KEY   ] }
  unique_entries    = duplicate_entries.uniq
  n_unique_entries  = unique_entries.size

  # NOTE(review): if several distinct raw lines each repeat verbatim, only the
  #               last one is remembered here (hash slot is overwritten per name)
  unique_entries.each do | uniq_value |
    n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
    duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
  end
  if n_unique_entries > 1
    duplicate_differing_entries[duplicate_name] = unique_entries
  end
end


## report ##

print REPORT_SEPARATOR
print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS

if DO_PRINT_INCOMPLETE
  { 'invalid entries'               => entries_invalid          ,
    'entries lacking description'   => entries_no_desc          ,
    'entries lacking tags'          => entries_no_tags          ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags'              => unspecified_tags         }.each_pair do | label , data |
    print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
    # entry hashes print their raw blacklist line; bare tag strings print as-is
    data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  end
end

unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { | entry_name | print " #{entry_name}\n" }
end

unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do | duplicate_name |
    identical_entry   = duplicate_identical_entries[duplicate_name]
    differing_entries = duplicate_differing_entries[duplicate_name]

    print "\n #{duplicate_name}:\n"
    print " identical entries:\n" unless identical_entry .nil?
    print " #{identical_entry}\n" unless identical_entry .nil?
    print " differing entries:\n" unless differing_entries.nil?
    differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
  end
end


## sanity check ##

# refuse to emit JSON unless the blacklist is fully clean
should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
if should_quit
  print "errors were found - JSON will not be generated\n"
  exit 1
end


## generate JSON ##

IO.write OUTPUT_JSON_FILE , entries.to_json
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n"