curl-curl/scripts/badwords
Daniel Stenberg 6870803187
badwords: only check comments and strings in source code
- when scanning source code, this now only checks source code comments
  and double-quote strings. No more finding bad words as part of code
- this allows the full scan to be done in a single invocation
- detects source code or markdown by file name extension
- moved the whitelist words config into the single `badwords.txt` file,
  no more having them separately (see top of file for syntax)
- all whitelisted words are checked case insensitively now
- removed support for whitelisting words on a specific line number. We
  did not use it and it is too fragile

Removing the actual code from getting scanned made the script take an
additional 0.5 seconds on my machine.

Scanning 1525 files now takes a little under 1.7 seconds for me.

Closes #20909
2026-03-13 08:54:35 +01:00

335 lines
7.2 KiB
Perl
Executable File

#!/usr/bin/env perl
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# bad[:=]correct
#
# If separator is '=', the string will be compared case sensitively.
# If separator is ':', the check is done case insensitively.
#
# To add white listed uses of bad words that are removed before checking for
# the bad ones:
#
# ---(accepted word)
# ---:[path]:(accepted word)
#
use strict;
use warnings;
use File::Basename;
#
## States
#
# 0 - default, initial state
# 1 - there was a slash
# 2 - quoted string
# 3 - // comment
# 4 - /* comment
# 5 - asterisk found within a /* comment
# 6 - #include line
# 7 - backslash in a string
#
## Flags
#
# 1 - include preprocessor line, ignore strings
sub srcline {
my ($state, $flags, $l) = @_;
my $line = "";
if(($state == 0) && ($l =~ /^ *\# *include/)) {
# preprocessor include line
$flags |= 1;
}
else {
# not preprocessor
$flags &= ~1;
}
if($state == 3) {
# // ended on the prev line, go back to init
$state = 0;
}
my @c = split(//, $l);
# state machine this line
for my $c (@c) {
if($state == 1) {
# we had a slash
if($c eq "/") {
# // confirmed, the rest of the line is a comment
$line .= "//";
$state = 3;
}
elsif($c eq "*") {
# /* confirmed
$state = 4;
$line .= "/*";
}
else {
# back to normal
$line .= " ";
$state = 0;
}
}
elsif($state == 2) {
# a string
if($c eq "\\") {
$line .= "\\";
$state = 7;
}
elsif($c eq "\"") {
# end of the string
$line .= "\"";
$state = 0;
}
else {
$line .= $c;
}
}
elsif($state == 3) {
# a // comment
$line .= $c;
}
elsif($state == 4) {
# a /* comment
if($c eq "*") {
# could be a comment close
$state = 5;
}
else {
$line .= $c;
}
}
elsif($state == 5) {
if($c eq "/") {
# a /* */ comment ended here */
$line .= "*/";
$state = 0;
}
else {
# the /* comment continues
$line .= "*$c";
$state = 4;
}
}
elsif($state == 7) {
# the prev was a backslash in a string
$line .= $c;
# switch back to normal string
$state = 2;
}
else {
if($c eq "/") {
$state = 1; # got a slash
}
elsif(($c eq "\"") && !($flags & 1)) {
# start of a string, not within a preprocessor line
$line .= "\"";
$state = 2;
}
elsif($c eq "\n") {
$line .= "\n";
}
else {
$line .= " ";
}
}
}
return $state, $flags, $line;
}
sub sourcecode {
my ($f) = @_;
my $state = 0;
my $flags = 0;
my @lines;
my $line;
open(F, "<$f");
while(<F>) {
my $l = $_;
($state, $flags, $line) = srcline($state, $flags, $l);
push @lines, $line;
}
close(F);
return @lines;
}
my @whitelist = (
# ignore what looks like URLs
'(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
# remove bolded sections
'\*\*.*?\*\*',
# remove backticked texts
'\`.*?\`'
);
my %alt;
my %exactcase;
my %wl;
my @w;
my @exact;
while(<STDIN>) {
chomp;
if($_ =~ /^#/) {
next;
}
if(/^---:([^:]*):(.*)/) {
# whitelist file + word
my $word = lc($2);
$wl{"$1:$word"}=1;
}
elsif($_ =~ /^---(.+)/) {
# whitelist word
push @whitelist, $1;
}
elsif($_ =~ /^(.*)([:=])(.*)/) {
my ($bad, $sep, $better)=($1, $2, $3);
if($sep eq "=") {
$alt{$bad} = $better;
push @exact, $bad;
}
else {
$alt{lc($bad)} = $better;
push @w, $bad;
}
}
}
# Build a single combined regex for case-insensitive words
my $re_ci;
if(@w) {
my $pat = join('|', map { quotemeta($_) } @w);
$re_ci = qr/\b($pat)\b/i;
}
# Build a single combined regex for case-sensitive (exact) words
my $re_cs;
if(@exact) {
my $pat = join('|', map { quotemeta($_) } @exact);
$re_cs = qr/\b($pat)\b/;
}
# Build a single combined regex for removing whitelisted content
my $re_wl;
my $pat = join('|', map { $_ } @whitelist);
$re_wl = qr/($pat)/;
my $errors = 0;
sub highlight {
my ($p, $w, $in, $f, $l, $lookup) = @_;
my $c = length($p)+1;
my $ch;
my $dir = dirname($f);
$ch = $dir . "/" . ":" . lc($w);
if($wl{$ch}) {
# whitelisted dirname + word
return;
}
my $updir = dirname($dir);
if($dir ne $updir) {
$ch = $updir . "/" . ":" . lc($w);
if($wl{$ch}) {
# whitelisted upper dirname + word
return;
}
}
$ch = $f . ":" . lc($w);
if($wl{$ch}) {
# whitelisted filename + word
return;
}
print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
printf STDERR " %4d | %s\n", $l, $in;
printf STDERR " | %*s^%s\n", length($p), " ",
"~" x (length($w)-1);
printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup};
$errors++;
}
sub document {
my ($f) = @_;
my @lines;
open(F, "<$f");
while(<F>) {
push @lines, $_;
}
close(F);
return @lines;
}
sub file {
my ($f) = @_;
my $l = 0;
my $skip_indented = 0;
my $source_code = 0;
if($f =~ /\.[ch]$/) {
$source_code = 1;
}
else {
# markdown
$skip_indented = 1;
}
my @lines;
if($source_code) {
@lines = sourcecode($f);
}
else {
@lines = document($f);
}
for my $in (@lines) {
$l++;
chomp $in;
if($skip_indented && $in =~ /^ /) {
next;
}
# remove the link part
$in =~ s/(\[.*\])\(.*\)/$1/g;
# remove whitelisted patterns (pre-compiled)
if($re_wl) {
$in =~ s/${re_wl}//ig;
}
# case-insensitive bad words
if($re_ci) {
if($in =~ /^(.*)$re_ci/i) {
highlight($1, $2, $in, $f, $l, lc($2));
}
}
# case-sensitive (exact) bad words
if($re_cs) {
if($in =~ /^(.*)$re_cs/) {
highlight($1, $2, $in, $f, $l, $2);
}
}
}
}
my @filemasks = @ARGV;
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
my @files;
while(my $each = <$git_ls_files>) {
chomp $each;
push @files, $each;
}
close $git_ls_files;
my $onum = scalar(@files);
my $num;
for my $e (@files) {
#printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
file($e);
}
exit $errors;