#!/usr/bin/env ruby =begin post_fsd_wiki.phantomjs this script is used to validate and report statistics on the blacklist entries it currently detects syntax errors, missing tags, unspecified tags, missing descriptions, duplicate entries for a single package (partitioning them as identical or differing), and 'neglected_entries' which are those with no tag, no description, and no replacement it optionally creates a JSON file with the results that can be the input to the post_fsd_wiki.phantomjs script =end # DEBUG begin DEBUG = false require 'byebug' if DEBUG DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ] def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ; # DEBUG end require 'json' require 'set' # entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description BLACKLIST_FILES = [ 'blacklist.txt' ] # BLACKLIST_FILES = [ 'blacklist-testdata.txt' ] VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/ BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/ RAW_ENTRY_KEY = :raw_blacklist_entry PACKAGE_NAME_KEY = :original_package # per blacklist SYNTAX REPLACEMENT_KEY = :libre_replacement # per blacklist SYNTAX REFERENCE_KEY = :ref # per blacklist SYNTAX ENTRY_ID_KEY = :id # per blacklist SYNTAX DESCRIPTION_KEY = :short_description # per blacklist SYNTAX BLACKLIST_TAGS_KEY = :blacklist_tags NONFREE_TAG = 'nonfree' SEMIFREE_TAG = 'semifree' USES_NONFREE_TAG = 'uses-nonfree' BRANDING_TAG = 'branding' TECHNICAL_TAG = 'technical' HAS_REPLACEMENT_TAG = 'FIXME:package' NEEDS_DESC_TAG = 'FIXME:description' ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG , TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ] DO_PRINT_STATS = true DO_PRINT_INCOMPLETE = true DO_PRINT_DUPLICATE = false REPORT_SEPARATOR = "------------------------------------------------------------\n" OUTPUT_JSON_FILE = 'blacklist-data.json' entries_invalid = [] entries = [] 
entry_freqs = {} # package name => number of entries seen with that name
entries_no_desc = [] # entries whose description field is empty
entries_no_tags = [] # entries carrying no "[tag]" prefixes
entries_unspecified_tags = [] # entries carrying tags outside ACCEPTABLE_TAGS
unspecified_tags = Set[] # the union of all unrecognized tag strings
duplicate_names = nil # deferred
duplicate_identical_entries = {} # name => raw line repeated verbatim (with count)
duplicate_differing_entries = {} # name => list of distinct raw lines


## parse data ##

BLACKLIST_FILES.each do | blacklist_filename |
  if DEBUG ; print "\nDEBUG: parsing #{blacklist_filename}\n" ; end ;

  (File.readlines blacklist_filename).each do | line |
    # DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }

    # skip blank lines and comment lines
    next if line.strip.empty? || (line.strip.start_with? '#')
    # collect syntactically-invalid lines separately, then move on
    # (NOTE: '<<' binds tighter than '&&', so this is "(push) && next")
    entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX

    entries << (entry = {})
    tokens = (line.split ':')

    if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ;

    # first four fields are positional; everything after the fourth ':' is
    # re-joined as the description (descriptions may themselves contain ':')
    entry[RAW_ENTRY_KEY ] = line
    entry[PACKAGE_NAME_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[REPLACEMENT_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[REFERENCE_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[ENTRY_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[DESCRIPTION_KEY ] = (tokens.join ':').gsub("\t" , '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # parse tags - repeatedly peel "[tag]" prefixes off the description,
    # moving each into the entry's tag list
    while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
      if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
      # debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
      entry[DESCRIPTION_KEY ] = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')

      if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
    end

    if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
    if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ;
  end
end


## process data ##

# classify each parsed entry and tally per-name frequencies
entries.each do | entry |
  if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;

  entry_name = entry[PACKAGE_NAME_KEY ]
  entry_desc = entry[DESCRIPTION_KEY ]
  entry_tags = entry[BLACKLIST_TAGS_KEY]
  entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS

  entry_freqs[entry_name] = (entry_freqs[entry_name] ||= 0) + 1
  entries_no_desc << entry if entry_desc .empty?
  entries_no_tags << entry if entry_tags .empty?
  entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  unspecified_tags.merge entry_unspecified_tags
end

duplicate_names = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
# 'neglected' = package names that are both incomplete AND have no replacement
neglected_entries = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
                    unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set

# partition each duplicated name's raw lines into verbatim repeats vs variants
duplicate_names.each do | duplicate_name |
  # next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG

  duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
                             .map! { | entry | entry[RAW_ENTRY_KEY ] }
  unique_entries = duplicate_entries.uniq
  n_unique_entries = unique_entries.size

  # NOTE(review): if several distinct raw values each repeat, only the last
  # one survives here - each assignment overwrites the previous
  unique_entries.each do | uniq_value |
    n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
    duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
  end
  if n_unique_entries > 1
    duplicate_differing_entries[duplicate_name] = unique_entries
  end
end


## report ##

print REPORT_SEPARATOR
print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS

if DO_PRINT_INCOMPLETE
  # each problem category is printed only when non-empty; entry hashes print
  # their raw blacklist line, plain strings (invalid lines, tags) print as-is
  { 'invalid entries' => entries_invalid ,
    'entries lacking description' => entries_no_desc ,
    'entries lacking tags' => entries_no_tags ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags' => unspecified_tags }.each_pair do | label , data |
    print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
    data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  end
end

unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { | entry_name | print " #{entry_name}\n" }
end

unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do | duplicate_name |
    identical_entry = duplicate_identical_entries[duplicate_name]
    differing_entries = duplicate_differing_entries[duplicate_name]

    print "\n #{duplicate_name}:\n"
    print " identical entries:\n" unless identical_entry .nil?
    print " #{identical_entry}\n" unless identical_entry .nil?
    print " differing entries:\n" unless differing_entries.nil?
    differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
  end
end


## sanity check ##

# refuse to emit JSON when any validation problem was detected
should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
(print "errors were found - JSON will not be generated\n" ; exit 1) if should_quit


## generate JSON ##

IO.write OUTPUT_JSON_FILE , entries.to_json ;
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n" ;