#!/usr/bin/env ruby =begin post_fsd_wiki.phantomjs this script is used to validate and report statistics on the blacklist entries it currently detects syntax errors, missing tags, unspecified tags, missing descriptions, duplicate entries for a single package (partitioning them as identical or differing), and 'neglected_entries' which are those with no tag, no description, and no replacement it optionally creates a JSON file with the results that can be the input to the post_fsd_wiki.phantomjs script =end # DEBUG begin DEBUG = false require 'byebug' if DEBUG DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ] def IS_DEBUG_FILTER_NAME name ; DEBUG_FILTER_NAMES.include? name ; end ; # DEBUG end require 'json' require 'set' # entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description BLACKLIST_FILES = [ 'blacklist.txt' ] # BLACKLIST_FILES = [ 'blacklist-testdata.txt' ] VALID_ENTRIES_REGEX = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/ BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/ RAW_ENTRY_KEY = :raw_blacklist_entry PACKAGE_NAME_KEY = :original_package # per blacklist SYNTAX REPLACEMENT_KEY = :libre_replacement # per blacklist SYNTAX REFERENCE_KEY = :ref # per blacklist SYNTAX ENTRY_ID_KEY = :id # per blacklist SYNTAX DESCRIPTION_KEY = :short_description # per blacklist SYNTAX BLACKLIST_TAGS_KEY = :blacklist_tags NONFREE_TAG = 'nonfree' SEMIFREE_TAG = 'semifree' USES_NONFREE_TAG = 'uses-nonfree' BRANDING_TAG = 'branding' TECHNICAL_TAG = 'technical' HAS_REPLACEMENT_TAG = 'FIXME:package' NEEDS_DESC_TAG = 'FIXME:description' ACCEPTABLE_TAGS = [ NONFREE_TAG , SEMIFREE_TAG , USES_NONFREE_TAG , BRANDING_TAG , TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG ] DO_PRINT_STATS = true DO_PRINT_INCOMPLETE = true DO_PRINT_DUPLICATE = false REPORT_SEPARATOR = "------------------------------------------------------------\n" OUTPUT_JSON_FILE = 'blacklist-data.json' entries_invalid = [] entries = [] 
entry_freqs = {} # package name => number of entries seen with that name
entries_no_desc = [] # entries whose description field is empty
entries_no_tags = [] # entries carrying no "[tag]" prefixes
entries_unspecified_tags = [] # entries carrying tags outside ACCEPTABLE_TAGS
unspecified_tags = Set[] # the union of all unrecognized tag strings
duplicate_names = nil # deferred
duplicate_identical_entries = {} # name => raw line repeated verbatim (with count)
duplicate_differing_entries = {} # name => list of distinct raw lines


## parse data ##

BLACKLIST_FILES.each do | blacklist_filename |
  if DEBUG ; print "\nDEBUG: parsing #{blacklist_filename}\n" ; end ;

  (File.readlines blacklist_filename).each do | line |
    # DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }

    # skip blank lines and comment lines
    next if line.strip.empty? || (line.strip.start_with? '#')
    # collect syntactically-invalid lines separately, then move on
    # (NOTE: '<<' binds tighter than '&&', so this is "(push) && next")
    entries_invalid << line && next unless line.match VALID_ENTRIES_REGEX

    entries << (entry = {})
    tokens = (line.split ':')

    if DEBUG ; tokens.each_with_index { | token , i | print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] } ; end ;

    # first four fields are positional; everything after the fourth ':' is
    # re-joined as the description (descriptions may themselves contain ':')
    entry[RAW_ENTRY_KEY ] = line
    entry[PACKAGE_NAME_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[REPLACEMENT_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[REFERENCE_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[ENTRY_ID_KEY ] = (tokens.shift ).gsub("\t" , '').strip
    entry[DESCRIPTION_KEY ] = (tokens.join ':').gsub("\t" , '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # parse tags - repeatedly peel "[tag]" prefixes off the description,
    # moving each into the entry's tag list
    while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
      if DEBUG ; print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n" ; print "desc IN=#{entry[DESCRIPTION_KEY]}\n" ; end ;
      # debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\1')
      entry[DESCRIPTION_KEY ] = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX , '\2')

      if DEBUG ; print "desc OUT=#{entry[DESCRIPTION_KEY]}\n" ; print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n" ; sleep 0.2 ; end ;
    end

    if DEBUG ; print "\nno tag for: #{ entry[PACKAGE_NAME_KEY]}\n" if entry[BLACKLIST_TAGS_KEY].empty? ; end ;
    if DEBUG ; print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if entry[DESCRIPTION_KEY ].empty? ; end ;
  end
end


## process data ##

# classify each parsed entry and tally per-name frequencies
entries.each do | entry |
  if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY]) ; print "\n" ; entry.each_pair { | k , v | print "DEBUG: #{k}: #{v}\n" } ; end ;

  entry_name = entry[PACKAGE_NAME_KEY ]
  entry_desc = entry[DESCRIPTION_KEY ]
  entry_tags = entry[BLACKLIST_TAGS_KEY]
  entry_unspecified_tags = entry_tags - ACCEPTABLE_TAGS

  entry_freqs[entry_name] = (entry_freqs[entry_name] ||= 0) + 1
  entries_no_desc << entry if entry_desc .empty?
  entries_no_tags << entry if entry_tags .empty?
  entries_unspecified_tags << entry unless entry_unspecified_tags.empty?
  unspecified_tags.merge entry_unspecified_tags
end

duplicate_names = entry_freqs.keys.select { | name | entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { | entry | entry[REPLACEMENT_KEY].empty? }
# 'neglected' = package names that are both incomplete AND have no replacement
neglected_entries = incomplete_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set &
                    unreplaced_entries.map { | entry | entry[PACKAGE_NAME_KEY] }.to_set

# partition each duplicated name's raw lines into verbatim repeats vs variants
duplicate_names.each do | duplicate_name |
  # next unless IS_DEBUG_FILTER_NAME duplicate_name ; p "duplicate_name=#{duplicate_name}" # DEBUG

  duplicate_entries = entries.select { | entry | entry[PACKAGE_NAME_KEY] == duplicate_name } \
                             .map! { | entry | entry[RAW_ENTRY_KEY ] }
  unique_entries = duplicate_entries.uniq
  n_unique_entries = unique_entries.size

  # NOTE(review): if several distinct raw values each repeat, only the last
  # one survives here - each assignment overwrites the previous
  unique_entries.each do | uniq_value |
    n_identical_entries = duplicate_entries.count { | dup_entry | dup_entry == uniq_value }
    duplicate_identical_entries[duplicate_name] = uniq_value + " (#{n_identical_entries} identical)" if n_identical_entries > 1
  end
  if n_unique_entries > 1
    duplicate_differing_entries[duplicate_name] = unique_entries
  end
end


## report ##

print REPORT_SEPARATOR
print "entries found: #{ (entries + entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries valid: #{ (entries ).size }\n" if DO_PRINT_STATS
print "entries_invalid: #{ (entries_invalid ).size }\n" if DO_PRINT_STATS
print "entries lacking tags: #{ (entries_no_tags ).size }\n" if DO_PRINT_STATS
print "entries lacking description: #{(entries_no_desc ).size }\n" if DO_PRINT_STATS
print "unspecified tags: #{ (unspecified_tags ).size }\n" if DO_PRINT_STATS
print "neglected entries: #{ (neglected_entries ).size }\n" if DO_PRINT_STATS
print "duplicate_names: #{ (duplicate_names ).size }\n" if DO_PRINT_STATS
print " identical entries: #{ (duplicate_identical_entries).size }\n" if DO_PRINT_STATS
print " differing entries: #{ (duplicate_differing_entries).keys.size}\n" if DO_PRINT_STATS

if DO_PRINT_INCOMPLETE
  # each problem category is printed only when non-empty; entry hashes print
  # their raw blacklist line, plain strings (invalid lines, tags) print as-is
  { 'invalid entries' => entries_invalid ,
    'entries lacking description' => entries_no_desc ,
    'entries lacking tags' => entries_no_tags ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags' => unspecified_tags }.each_pair do | label , data |
    print REPORT_SEPARATOR + "#{label}:\n" unless data.empty?
    data.each { | entry | print " #{((entry.is_a? Hash) ? entry[RAW_ENTRY_KEY] : entry).strip}\n" }
  end
end

unless neglected_entries.empty? || ! DO_PRINT_INCOMPLETE
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { | entry_name | print " #{entry_name}\n" }
end

unless duplicate_names.empty? || ! DO_PRINT_DUPLICATE
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do | duplicate_name |
    identical_entry = duplicate_identical_entries[duplicate_name]
    differing_entries = duplicate_differing_entries[duplicate_name]

    print "\n #{duplicate_name}:\n"
    print " identical entries:\n" unless identical_entry .nil?
    print " #{identical_entry}\n" unless identical_entry .nil?
    print " differing entries:\n" unless differing_entries.nil?
    differing_entries.each { | entry | print " #{entry}\n" } unless differing_entries.nil?
  end
end


## sanity check ##

# refuse to emit JSON when any validation problem was detected
should_quit = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
(print "errors were found - JSON will not be generated\n" ; exit 1) if should_quit


## generate JSON ##

IO.write OUTPUT_JSON_FILE , entries.to_json ;
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n" ;