summaryrefslogtreecommitdiff
path: root/report.rb
blob: 4c3ff74513ac18c8cfc1c79e96b63866da05133c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env ruby

=begin
  report.rb

    this script is used to validate and report statistics on the blacklist entries

    it currently detects syntax errors, missing tags, unspecified tags, missing descriptions,
    duplicate entries for a single package (partitioning them as identical or differing),
    and 'neglected_entries' which are those with no tag, no description, and no replacement

    it optionally creates a JSON file with the results
    that can be the input to the post_fsd_wiki.phantomjs script
=end


# DEBUG begin
DEBUG = false
require 'byebug' if DEBUG
DEBUG_FILTER_NAMES = [ 'vapoursynth-plugin-fluxsmooth' ]
# true when `name` is one of the packages singled out for debug tracing
def IS_DEBUG_FILTER_NAME(name)
  DEBUG_FILTER_NAMES.include? name
end
# DEBUG end


require 'json'
require 'set'

# entry syntax => original-package:[libre-replacement]:[ref]:[id]:short-description
BLACKLIST_FILES      = [ 'blacklist.txt' ].freeze
# BLACKLIST_FILES      = [ 'blacklist-testdata.txt' ].freeze
# Accepts lines with exactly four ':' separators before the description; the
# optional third field is restricted to the known reference sources. Lines
# beginning with '#' (comments) never match (first char class excludes '#').
VALID_ENTRIES_REGEX  = /^[^:\[\]#]*:[^:\[\]]*:(sv|debian|parabola|fsf|fedora)?:[^:\[\]]*:\w*([^\d:]+:.*|\[[^:]+:.*|[^:]*)$/
# Captures a leading "[tag]" marker (group 1) and the remainder (group 2)
BLACKLIST_TAGS_REGEX = /^\[([^\]]*)\] *(.*)/
# keys of the per-entry hash built by the parser
RAW_ENTRY_KEY        = :raw_blacklist_entry
PACKAGE_NAME_KEY     = :original_package    # per blacklist SYNTAX
REPLACEMENT_KEY      = :libre_replacement   # per blacklist SYNTAX
REFERENCE_KEY        = :ref                 # per blacklist SYNTAX
ENTRY_ID_KEY         = :id                  # per blacklist SYNTAX
DESCRIPTION_KEY      = :short_description   # per blacklist SYNTAX
BLACKLIST_TAGS_KEY   = :blacklist_tags
# the recognized "[tag]" markers; anything else is reported as unspecified
NONFREE_TAG          = 'nonfree'
SEMIFREE_TAG         = 'semifree'
USES_NONFREE_TAG     = 'uses-nonfree'
BRANDING_TAG         = 'branding'
TECHNICAL_TAG        = 'technical'
HAS_REPLACEMENT_TAG  = 'FIXME:package'
NEEDS_DESC_TAG       = 'FIXME:description'
ACCEPTABLE_TAGS      = [ NONFREE_TAG   , SEMIFREE_TAG        , USES_NONFREE_TAG , BRANDING_TAG ,
                         TECHNICAL_TAG , HAS_REPLACEMENT_TAG , NEEDS_DESC_TAG                  ].freeze
# report toggles
DO_PRINT_STATS       = true
DO_PRINT_INCOMPLETE  = true
DO_PRINT_DUPLICATE   = false
REPORT_SEPARATOR     = "------------------------------------------------------------\n".freeze
OUTPUT_JSON_FILE     = 'blacklist-data.json'

# accumulators filled by the parse and process passes below
entries_invalid             = []  # raw lines that failed VALID_ENTRIES_REGEX
entries                     = []  # parsed entry hashes (keyed by the *_KEY constants)
entry_freqs                 = {}  # package name => number of entries seen
entries_no_desc             = []  # entries whose description is empty
entries_no_tags             = []  # entries with no "[tag]" markers
entries_unspecified_tags    = []  # entries carrying tags outside ACCEPTABLE_TAGS
unspecified_tags            = Set[]  # the unrecognized tag strings themselves
duplicate_names             = nil # deferred
duplicate_identical_entries = {}  # name => representative raw line "(N identical)"
duplicate_differing_entries = {}  # name => array of differing raw lines


## parse data ##

# Parse each blacklist file into `entries` (one hash per valid line, keyed by
# the *_KEY constants), collecting unparsable lines into `entries_invalid`.
BLACKLIST_FILES.each do |blacklist_filename|
  print "\nDEBUG: parsing #{blacklist_filename}\n" if DEBUG

  (File.readlines blacklist_filename).each do |line|
# DEBUG_FILTER_NAMES.each { | ea | debugger if line.start_with? ea }

    # skip blank lines and whole-line comments
    next if line.strip.empty? || (line.strip.start_with? '#')

    # collect syntactically-invalid lines for the report, then move on
    # (was `entries_invalid << line && next unless ...`, which only worked
    #  because `<<` binds tighter than `&&` — the explicit form avoids the trap)
    unless line.match VALID_ENTRIES_REGEX
      entries_invalid << line
      next
    end

    entries << (entry = {})
    tokens   = (line.split ':')

    if DEBUG
      tokens.each_with_index { |token, i| print "DEBUG: tokens[#{i}]=#{token}\n" if IS_DEBUG_FILTER_NAME tokens[0] }
    end

    # the first four ':'-separated fields are fixed; the remainder is re-joined
    # on ':' because descriptions may themselves contain colons
    entry[RAW_ENTRY_KEY     ] = line
    entry[PACKAGE_NAME_KEY  ] = tokens.shift.gsub("\t", '').strip
    entry[REPLACEMENT_KEY   ] = tokens.shift.gsub("\t", '').strip
    entry[REFERENCE_KEY     ] = tokens.shift.gsub("\t", '').strip
    entry[ENTRY_ID_KEY      ] = tokens.shift.gsub("\t", '').strip
    entry[DESCRIPTION_KEY   ] = (tokens.join ':').gsub("\t", '').strip
    entry[BLACKLIST_TAGS_KEY] = []

    # peel leading "[tag]" markers off the description one at a time,
    # accumulating them under BLACKLIST_TAGS_KEY
    while (entry[DESCRIPTION_KEY].start_with? '[') && (entry[DESCRIPTION_KEY].include? ']')
      if DEBUG
        print "\n parsing tag for: #{entry[PACKAGE_NAME_KEY]}\n"
        print "desc  IN=#{entry[DESCRIPTION_KEY]}\n"
      end
# debugger if DEBUG && (DEBUG_FILTER_NAMES.include? entry[PACKAGE_NAME_KEY])

      entry[BLACKLIST_TAGS_KEY] << (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX, '\1')
      entry[DESCRIPTION_KEY   ]  = (entry[DESCRIPTION_KEY].gsub BLACKLIST_TAGS_REGEX, '\2')

      if DEBUG
        print "desc OUT=#{entry[DESCRIPTION_KEY]}\n"
        print "tags=#{entry[BLACKLIST_TAGS_KEY]}\n"
        sleep 0.2
      end
    end
    print "\nno tag for: #{entry[PACKAGE_NAME_KEY]}\n"  if DEBUG && entry[BLACKLIST_TAGS_KEY].empty?
    print "\nno desc for: #{entry[PACKAGE_NAME_KEY]}\n" if DEBUG && entry[DESCRIPTION_KEY].empty?
  end
end


## process data ##

# Classify each parsed entry: count name frequencies, collect entries that
# lack a description or tags, and gather any tags outside ACCEPTABLE_TAGS.
entries.each do |entry|
  if DEBUG && (IS_DEBUG_FILTER_NAME entry[PACKAGE_NAME_KEY])
    print "\n"
    entry.each_pair { |key, value| print "DEBUG: #{key}: #{value}\n" }
  end

  name         = entry[PACKAGE_NAME_KEY  ]
  desc         = entry[DESCRIPTION_KEY   ]
  tags         = entry[BLACKLIST_TAGS_KEY]
  unknown_tags = tags - ACCEPTABLE_TAGS

  entry_freqs[name] = (entry_freqs[name] || 0) + 1
  entries_no_desc          << entry if     desc.empty?
  entries_no_tags          << entry if     tags.empty?
  entries_unspecified_tags << entry unless unknown_tags.empty?
  unspecified_tags.merge unknown_tags
end

# names seen more than once; entries with no replacement; and "neglected"
# entries — incomplete (no tag or no description) AND with no replacement
duplicate_names    = entry_freqs.keys.select { |name| entry_freqs[name] > 1 }
incomplete_entries = entries_no_desc + entries_no_tags
unreplaced_entries = entries.select { |entry| entry[REPLACEMENT_KEY].empty? }
neglected_entries  = (incomplete_entries.map { |entry| entry[PACKAGE_NAME_KEY] }.to_set) &
                     (unreplaced_entries.map { |entry| entry[PACKAGE_NAME_KEY] }.to_set)

# Partition duplicated package names: record a representative raw line for
# names with repeated identical entries, and the full set of variants for
# names whose duplicate entries differ from one another.
duplicate_names.each do |name|
# next unless IS_DEBUG_FILTER_NAME name ; p "duplicate_name=#{name}" # DEBUG

  raw_lines = entries.select { |entry| entry[PACKAGE_NAME_KEY] == name }
                     .map    { |entry| entry[RAW_ENTRY_KEY   ]         }
  uniques   = raw_lines.uniq

  uniques.each do |raw|
    occurrences = raw_lines.count { |candidate| candidate == raw }
    duplicate_identical_entries[name] = "#{raw} (#{occurrences} identical)" if occurrences > 1
  end

  duplicate_differing_entries[name] = uniques if uniques.size > 1
end


## report ##

print REPORT_SEPARATOR

# summary counts
if DO_PRINT_STATS
  print "entries found: #{(entries + entries_invalid).size}\n"
  print "entries valid: #{entries.size}\n"
  print "entries_invalid: #{entries_invalid.size}\n"
  print "entries lacking tags: #{entries_no_tags.size}\n"
  print "entries lacking description: #{entries_no_desc.size}\n"
  print "unspecified tags: #{unspecified_tags.size}\n"
  print "neglected entries: #{neglected_entries.size}\n"
  print "duplicate_names: #{duplicate_names.size}\n"
  print "  identical entries: #{duplicate_identical_entries.size}\n"
  print "  differing entries: #{duplicate_differing_entries.keys.size}\n"
end

# itemized listings of each problem category (raw line for entry hashes,
# the bare string for unspecified tag names)
if DO_PRINT_INCOMPLETE
  problem_listings = {
    'invalid entries'               => entries_invalid          ,
    'entries lacking description'   => entries_no_desc          ,
    'entries lacking tags'          => entries_no_tags          ,
    'entries with unspecified tags' => entries_unspecified_tags ,
    'unspecified tags'              => unspecified_tags
  }
  problem_listings.each_pair do |label, items|
    print REPORT_SEPARATOR + "#{label}:\n" unless items.empty?
    items.each do |item|
      text = (item.is_a? Hash) ? item[RAW_ENTRY_KEY] : item
      print "  #{text.strip}\n"
    end
  end
end

# list neglected entries (no tag, no description, no replacement)
if DO_PRINT_INCOMPLETE && !neglected_entries.empty?
  print REPORT_SEPARATOR + "neglected entries:\n"
  neglected_entries.each { |name| print "  #{name}\n" }
end

# list duplicated entries, split into identical and differing variants
if DO_PRINT_DUPLICATE && !duplicate_names.empty?
  print REPORT_SEPARATOR + "duplicate entries:\n"
  duplicate_names.each do |name|
    identical = duplicate_identical_entries[name]
    differing = duplicate_differing_entries[name]

    print "\n  #{name}:\n"
    unless identical.nil?
      print "    identical entries:\n"
      print "      #{identical}\n"
    end
    unless differing.nil?
      print "    differing entries:\n"
      differing.each { |variant| print "      #{variant}\n" }
    end
  end
end


## sanity check ##

# refuse to generate JSON while any validation problem remains
errors_found = ! (entries_invalid.empty? && unspecified_tags.empty? && duplicate_names.empty?)
if errors_found
  print "errors were found - JSON will not be generated\n"
  exit 1
end


## generate JSON ##

# emit the parsed entries for consumption by post_fsd_wiki.phantomjs
IO.write OUTPUT_JSON_FILE, entries.to_json
print "\nwrote: #{OUTPUT_JSON_FILE}\n\ndone\n"