curl-curl/scripts/badwords
Viktor Szakats 435eabeac8
badwords: rework exceptions, fix many of them
Also:
- support per-directory and per-upper-directory whitelist entries.
- convert badlist input grep tweak into the above format.
  (except for 'And' which had just a few hits.)
- fix many code exceptions, but do not enforce.
  (there also remain about 350 'will' uses in lib)
- fix badwords in example code, drop exceptions.
- badwords-all: convert to Perl.
  To make it usable from CMake.
- FAQ: reword to not use 'will'. Drop exception.

Closes #20886
2026-03-12 01:01:16 +01:00

189 lines
4.1 KiB
Perl
Executable File

#!/usr/bin/env perl
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# bad[:=]correct
#
# If separator is '=', the string will be compared case sensitively.
# If separator is ':', the check is done case insensitively.
#
# To add white listed uses of bad words that are removed before checking for
# the bad ones:
#
# ---(accepted word)
#
use strict;
use warnings;
use File::Basename;
my @whitelist = (
# ignore what looks like URLs
'(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
# remove bolded sections
'\*\*.*?\*\*',
# remove backticked texts
'\`.*?\`'
);
my %alt;
my %exactcase;
my $skip_indented = 1;
if($ARGV[0] eq "-a") {
shift @ARGV;
$skip_indented = 0;
}
my %wl;
if($ARGV[0] eq "-w") {
shift @ARGV;
my $file = shift @ARGV;
open(W, "<$file") or die "Cannot open '$file': $!";
while(<W>) {
if(/^#/) {
# allow #-comments
next;
}
if(/^([^:]*):(\d*):(.*)/) {
$wl{"$1:$2:$3"}=1;
#print STDERR "whitelisted $1:$2:$3\n";
}
}
close(W);
}
my @w;
my @exact;
while(<STDIN>) {
chomp;
if($_ =~ /^#/) {
next;
}
if($_ =~ /^---(.+)/) {
push @whitelist, $1;
}
elsif($_ =~ /^(.*)([:=])(.*)/) {
my ($bad, $sep, $better)=($1, $2, $3);
if($sep eq "=") {
$alt{$bad} = $better;
push @exact, $bad;
}
else {
$alt{lc($bad)} = $better;
push @w, $bad;
}
}
}
# Build a single combined regex for case-insensitive words
my $re_ci;
if(@w) {
my $pat = join('|', map { quotemeta($_) } @w);
$re_ci = qr/\b($pat)\b/i;
}
# Build a single combined regex for case-sensitive (exact) words
my $re_cs;
if(@exact) {
my $pat = join('|', map { quotemeta($_) } @exact);
$re_cs = qr/\b($pat)\b/;
}
# Build a single combined regex for removing whitelisted content
my $re_wl;
my $pat = join('|', map { $_ } @whitelist);
$re_wl = qr/($pat)/;
my $errors = 0;
sub highlight {
my ($p, $w, $in, $f, $l, $lookup) = @_;
my $c = length($p)+1;
my $ch;
my $dir = dirname($f);
$ch = $dir . "/" . "::" . $w;
if($wl{$ch}) {
# whitelisted dirname + word
return;
}
my $updir = dirname($dir);
if($dir ne $updir) {
$ch = $updir . "/" . "::" . $w;
if($wl{$ch}) {
# whitelisted upper dirname + word
return;
}
}
$ch = $f . "::" . $w;
if($wl{$ch}) {
# whitelisted filename + word
return;
}
$ch = "$f:$l:$w";
if($wl{$ch}) {
# whitelisted filename + line + word
return;
}
print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
printf STDERR " %4d | %s\n", $l, $in;
printf STDERR " | %*s^%s\n", length($p), " ",
"~" x (length($w)-1);
printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup};
$errors++;
}
sub file {
my ($f) = @_;
my $l = 0;
open(F, "<$f");
while(<F>) {
my $in = $_;
$l++;
chomp $in;
if($skip_indented && $in =~ /^ /) {
next;
}
# remove the link part
$in =~ s/(\[.*\])\(.*\)/$1/g;
# remove whitelisted patterns (pre-compiled)
if($re_wl) {
$in =~ s/${re_wl}//ig;
}
# case-insensitive bad words
if($re_ci) {
if($in =~ /^(.*)$re_ci/i) {
highlight($1, $2, $in, $f, $l, lc($2));
}
}
# case-sensitive (exact) bad words
if($re_cs) {
if($in =~ /^(.*)$re_cs/) {
highlight($1, $2, $in, $f, $l, $2);
}
}
}
close(F);
}
my @filemasks = @ARGV;
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
my @files;
while(my $each = <$git_ls_files>) {
chomp $each;
push @files, $each;
}
close $git_ls_files;
my $onum = scalar(@files);
my $num;
for my $e (@files) {
#printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
file($e);
}
exit $errors;