mirror of
https://github.com/curl/curl.git
synced 2026-04-11 12:01:42 +08:00
badwords: only check comments and strings in source code
- when scanning source code, this now only checks source code comments and
  double-quoted strings. No more finding bad words as part of code
- this allows the full scan to be done in a single invocation
- detects source code or markdown by file name extension
- moved the whitelist words config into the single `badwords.txt` file, no
  more having them separately (see top of file for syntax)
- all whitelisted words are checked case insensitively now
- removed support for whitelisting words on a specific line number. We did
  not use it and it is too fragile

Removing the actual code from getting scanned made the script take an
additional 0.5 seconds on my machine. Scanning 1525 files now takes a little
under 1.7 seconds for me.

Closes #20909
This commit is contained in:
parent
2b3438d486
commit
6870803187
@ -27,7 +27,7 @@ EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl \
|
||||
cdall cd2cd managen dmaketgz maketgz release-tools.sh verify-release \
|
||||
cmakelint.sh mdlinkcheck CMakeLists.txt perlcheck.sh pythonlint.sh \
|
||||
spacecheck.pl randdisable wcurl top-complexity extract-unit-protos \
|
||||
.checksrc badwords badwords-all badwords.ok badwords.txt
|
||||
.checksrc badwords badwords-all badwords.txt
|
||||
|
||||
dist_bin_SCRIPTS = wcurl
|
||||
|
||||
|
||||
214
scripts/badwords
214
scripts/badwords
@ -12,6 +12,7 @@
|
||||
# the bad ones:
|
||||
#
|
||||
# ---(accepted word)
|
||||
# ---:[path]:(accepted word)
|
||||
#
|
||||
|
||||
use strict;
|
||||
@ -19,6 +20,145 @@ use warnings;
|
||||
|
||||
use File::Basename;
|
||||
|
||||
#
|
||||
## States
|
||||
#
|
||||
# 0 - default, initial state
|
||||
# 1 - there was a slash
|
||||
# 2 - quoted string
|
||||
# 3 - // comment
|
||||
# 4 - /* comment
|
||||
# 5 - asterisk found within a /* comment
|
||||
# 6 - #include line
|
||||
# 7 - backslash in a string
|
||||
#
|
||||
## Flags
|
||||
#
|
||||
# 1 - include preprocessor line, ignore strings
|
||||
|
||||
# Filter one line of C source so that only comments and double-quoted
# strings remain; every other character is replaced by a space. The output
# keeps the same length as the input so that column numbers still match the
# original line.
#
# Arguments:
#   $state - scanner state carried over from the previous line:
#              0 default, 1 seen a slash, 2 inside a string, 3 inside a //
#              comment, 4 inside a /* comment, 5 seen an asterisk inside a
#              /* comment, 7 seen a backslash inside a string
#   $flags - flag bits carried over from the previous line; bit 1 is
#            recomputed here for every line and means "#include line, do
#            not treat quotes as strings"
#   $l     - the input line
#
# Returns the list ($state, $flags, $line): the state and flags to pass in
# for the next line, and the filtered text.
#
# NOTE: character literals are not recognized, so a '"' in code is taken as
# the start of a string.
sub srcline {
    my ($state, $flags, $l) = @_;
    my $line = "";

    if($state == 3) {
        # // ended on the prev line, go back to init. This must happen
        # before the include check below, or an #include directly after a
        # // comment line would not be recognized.
        $state = 0;
    }

    if(($state == 0) && ($l =~ /^ *\# *include/)) {
        # preprocessor include line
        $flags |= 1;
    }
    else {
        # not preprocessor
        $flags &= ~1;
    }

    # state machine this line
    for my $c (split(//, $l)) {
        if($state == 1) {
            # we had a slash
            if($c eq "/") {
                # // confirmed, the rest of the line is a comment
                $line .= "//";
                $state = 3;
                next;
            }
            if($c eq "*") {
                # /* confirmed
                $line .= "/*";
                $state = 4;
                next;
            }
            # a lone slash is plain code: blank it out, then let the
            # current character be handled as ordinary code below so that
            # it is neither lost nor miscounted
            $line .= " ";
            $state = 0;
        }

        if($state == 2) {
            # a string
            if($c eq "\\") {
                $line .= "\\";
                $state = 7;
            }
            elsif($c eq "\"") {
                # end of the string
                $line .= "\"";
                $state = 0;
            }
            else {
                $line .= $c;
            }
        }
        elsif($state == 3) {
            # a // comment
            $line .= $c;
        }
        elsif($state == 4) {
            # a /* comment
            if($c eq "*") {
                # could be a comment close, hold the asterisk back until
                # the next character is known
                $state = 5;
            }
            else {
                $line .= $c;
            }
        }
        elsif($state == 5) {
            if($c eq "/") {
                # a /* */ comment ended here
                $line .= "*/";
                $state = 0;
            }
            elsif($c eq "*") {
                # another asterisk: output the held one and keep holding
                # this one, it may still be followed by the closing slash
                $line .= "*";
            }
            else {
                # the /* comment continues
                $line .= "*$c";
                $state = 4;
            }
        }
        elsif($state == 7) {
            # the prev was a backslash in a string
            $line .= $c;
            # switch back to normal string
            $state = 2;
        }
        else {
            if($c eq "/") {
                $state = 1; # got a slash
            }
            elsif(($c eq "\"") && !($flags & 1)) {
                # start of a string, not within a preprocessor line
                $line .= "\"";
                $state = 2;
            }
            elsif($c eq "\n") {
                $line .= "\n";
            }
            else {
                $line .= " ";
            }
        }
    }
    return $state, $flags, $line;
}
|
||||
|
||||
# Read the source code file $f and return a list with one entry per input
# line, each filtered through srcline() so that only comments and
# double-quoted strings remain.
#
# The scanner state and flags returned by srcline() for one line are passed
# in for the next, so multi-line constructs are tracked across lines.
#
# If the file cannot be opened, a warning is printed and an empty list is
# returned.
sub sourcecode {
    my ($f) = @_;
    my $state = 0;
    my $flags = 0;
    my @lines;

    # three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global handle is shared with other subs
    open(my $fh, "<", $f) or do {
        warn "Cannot open '$f': $!\n";
        return @lines;
    };
    while(my $l = <$fh>) {
        my $line;
        ($state, $flags, $line) = srcline($state, $flags, $l);
        push @lines, $line;
    }
    close($fh);
    return @lines;
}
|
||||
|
||||
my @whitelist = (
|
||||
# ignore what looks like URLs
|
||||
'(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
|
||||
@ -29,29 +169,8 @@ my @whitelist = (
|
||||
);
|
||||
my %alt;
|
||||
my %exactcase;
|
||||
my $skip_indented = 1;
|
||||
|
||||
if($ARGV[0] eq "-a") {
|
||||
shift @ARGV;
|
||||
$skip_indented = 0;
|
||||
}
|
||||
my %wl;
|
||||
if($ARGV[0] eq "-w") {
|
||||
shift @ARGV;
|
||||
my $file = shift @ARGV;
|
||||
open(W, "<$file") or die "Cannot open '$file': $!";
|
||||
while(<W>) {
|
||||
if(/^#/) {
|
||||
# allow #-comments
|
||||
next;
|
||||
}
|
||||
if(/^([^:]*):(\d*):(.*)/) {
|
||||
$wl{"$1:$2:$3"}=1;
|
||||
#print STDERR "whitelisted $1:$2:$3\n";
|
||||
}
|
||||
}
|
||||
close(W);
|
||||
}
|
||||
|
||||
my @w;
|
||||
my @exact;
|
||||
@ -60,7 +179,13 @@ while(<STDIN>) {
|
||||
if($_ =~ /^#/) {
|
||||
next;
|
||||
}
|
||||
if($_ =~ /^---(.+)/) {
|
||||
if(/^---:([^:]*):(.*)/) {
|
||||
# whitelist file + word
|
||||
my $word = lc($2);
|
||||
$wl{"$1:$word"}=1;
|
||||
}
|
||||
elsif($_ =~ /^---(.+)/) {
|
||||
# whitelist word
|
||||
push @whitelist, $1;
|
||||
}
|
||||
elsif($_ =~ /^(.*)([:=])(.*)/) {
|
||||
@ -104,29 +229,24 @@ sub highlight {
|
||||
my $ch;
|
||||
|
||||
my $dir = dirname($f);
|
||||
$ch = $dir . "/" . "::" . $w;
|
||||
$ch = $dir . "/" . ":" . lc($w);
|
||||
if($wl{$ch}) {
|
||||
# whitelisted dirname + word
|
||||
return;
|
||||
}
|
||||
my $updir = dirname($dir);
|
||||
if($dir ne $updir) {
|
||||
$ch = $updir . "/" . "::" . $w;
|
||||
$ch = $updir . "/" . ":" . lc($w);
|
||||
if($wl{$ch}) {
|
||||
# whitelisted upper dirname + word
|
||||
return;
|
||||
}
|
||||
}
|
||||
$ch = $f . "::" . $w;
|
||||
$ch = $f . ":" . lc($w);
|
||||
if($wl{$ch}) {
|
||||
# whitelisted filename + word
|
||||
return;
|
||||
}
|
||||
$ch = "$f:$l:$w";
|
||||
if($wl{$ch}) {
|
||||
# whitelisted filename + line + word
|
||||
return;
|
||||
}
|
||||
|
||||
print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
|
||||
printf STDERR " %4d | %s\n", $l, $in;
|
||||
@ -136,12 +256,39 @@ sub highlight {
|
||||
$errors++;
|
||||
}
|
||||
|
||||
# Read the file $f and return its lines unmodified, each still including
# its line ending.
#
# If the file cannot be opened, a warning is printed and an empty list is
# returned.
sub document {
    my ($f) = @_;
    my @lines;

    # three-argument open with a lexical handle: the file name is never
    # interpreted as a mode and no global handle is shared with other subs
    open(my $fh, "<", $f) or do {
        warn "Cannot open '$f': $!\n";
        return @lines;
    };
    @lines = <$fh>;
    close($fh);
    return @lines;
}
|
||||
|
||||
sub file {
|
||||
my ($f) = @_;
|
||||
my $l = 0;
|
||||
open(F, "<$f");
|
||||
while(<F>) {
|
||||
my $in = $_;
|
||||
|
||||
my $skip_indented = 0;
|
||||
my $source_code = 0;
|
||||
if($f =~ /\.[ch]$/) {
|
||||
$source_code = 1;
|
||||
}
|
||||
else {
|
||||
# markdown
|
||||
$skip_indented = 1;
|
||||
}
|
||||
|
||||
my @lines;
|
||||
if($source_code) {
|
||||
@lines = sourcecode($f);
|
||||
}
|
||||
else {
|
||||
@lines = document($f);
|
||||
}
|
||||
for my $in (@lines) {
|
||||
$l++;
|
||||
chomp $in;
|
||||
if($skip_indented && $in =~ /^ /) {
|
||||
@ -166,7 +313,6 @@ sub file {
|
||||
}
|
||||
}
|
||||
}
|
||||
close(F);
|
||||
}
|
||||
|
||||
my @filemasks = @ARGV;
|
||||
|
||||
@ -10,5 +10,4 @@ use File::Basename;
|
||||
|
||||
chdir dirname(__FILE__) . "/..";
|
||||
|
||||
system("scripts/badwords -a -w scripts/badwords.ok src lib include docs/examples < scripts/badwords.txt");
|
||||
system("scripts/badwords -w scripts/badwords.ok '**.md' projects/OS400/README.OS400 < scripts/badwords.txt");
|
||||
system("scripts/badwords '**.md' projects/OS400/README.OS400 src lib include docs/examples < scripts/badwords.txt");
|
||||
|
||||
@ -1,15 +0,0 @@
|
||||
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
|
||||
#
|
||||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
# whitelisted uses of bad words
|
||||
# file:[line]:rule
|
||||
lib/urldata.h:: url
|
||||
include/curl/::will
|
||||
lib/::But
|
||||
lib/::So
|
||||
lib/::will
|
||||
lib/::Will
|
||||
lib/::WILL
|
||||
src/::will
|
||||
src/::Will
|
||||
@ -2,6 +2,12 @@
|
||||
#
|
||||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
# whitelisted uses of bad words (case insensitive) can be done in two ways,
|
||||
# globally and per-file.
|
||||
#
|
||||
# ---[word]
|
||||
# ---:[file]:[word]
|
||||
#
|
||||
back-end:backend
|
||||
e-mail:email
|
||||
run-time:runtime
|
||||
@ -95,11 +101,14 @@ Curl=curl
|
||||
cURL=curl
|
||||
Libcurl=libcurl
|
||||
LibCurl=libcurl
|
||||
---WWW::Curl
|
||||
---NET::Curl
|
||||
---Curl Corporation
|
||||
manpages:man pages
|
||||
manpage:man page
|
||||
favour:favor
|
||||
basically:rephrase?
|
||||
However,:rephrase?
|
||||
---WWW::Curl
|
||||
---NET::Curl
|
||||
---Curl Corporation
|
||||
---:include/curl/:will
|
||||
---:lib/:will
|
||||
---:src/:will
|
||||
|
||||
Loading…
Reference in New Issue
Block a user