badwords: move into ./scripts, speed up

- 'badwords' is now a target in Makefile.am

- change badwords.txt to specify plain "words" instead of regexes so the
  script can build single regexes when scanning, which makes the script
  perform much faster (~6 times faster)

Closes #20869
This commit is contained in:
Daniel Stenberg 2026-03-09 13:32:14 +01:00
parent 248dd9e55f
commit 713287188e
No known key found for this signature in database
GPG key ID: 5CC908FDB71E12C2
7 changed files with 123 additions and 78 deletions

View file

@ -124,7 +124,7 @@ jobs:
persist-credentials: false
- name: 'badwords'
run: .github/scripts/badwords.pl -w .github/scripts/badwords.ok '**.md' projects/OS400/README.OS400 < .github/scripts/badwords.txt
run: ./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt
- name: 'verify synopsis'
run: .github/scripts/verify-synopsis.pl docs/libcurl/curl*.md

View file

@ -181,4 +181,4 @@ jobs:
- name: 'badwords'
run: |
# we allow some extra in source code
grep -Ev '(\\bwill| But: | So : )' .github/scripts/badwords.txt | .github/scripts/badwords.pl -a src lib include docs/examples
grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples

View file

@ -181,3 +181,7 @@ tidy:
clean-local:
(cd tests && $(MAKE) clean)
badwords:
grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples
./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt

View file

@ -22,11 +22,12 @@
#
###########################################################################
EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl checksrc-all.pl \
mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd cdall cd2cd managen \
dmaketgz maketgz release-tools.sh verify-release cmakelint.sh mdlinkcheck \
CMakeLists.txt perlcheck.sh pythonlint.sh spacecheck.pl randdisable wcurl \
top-complexity extract-unit-protos .checksrc
EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl \
checksrc-all.pl mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd \
cdall cd2cd managen dmaketgz maketgz release-tools.sh verify-release \
cmakelint.sh mdlinkcheck CMakeLists.txt perlcheck.sh pythonlint.sh \
spacecheck.pl randdisable wcurl top-complexity extract-unit-protos \
.checksrc badwords badwords.ok badwords.txt
dist_bin_SCRIPTS = wcurl

View file

@ -17,7 +17,11 @@
use strict;
use warnings;
my @whitelist;
my @whitelist = (
# ignore what looks like URLs
'(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
# ignore bolded sections
'\*\*(.*?)\*\*');
my %alt;
my %exactcase;
my $skip_indented = 1;
@ -45,6 +49,7 @@ if($ARGV[0] eq "-w") {
}
my @w;
my @exact;
while(<STDIN>) {
chomp;
if($_ =~ /^#/) {
@ -55,16 +60,55 @@ while(<STDIN>) {
}
elsif($_ =~ /^(.*)([:=])(.*)/) {
my ($bad, $sep, $better)=($1, $2, $3);
push @w, $bad;
$alt{$bad} = $better;
if($sep eq "=") {
$exactcase{$bad} = 1;
push @exact, $bad;
}
else {
push @w, $bad;
}
}
}
# Turn one plain word from the word list into a regex fragment.
#
# A word boundary (\b) is only added on a side where the word itself starts
# or ends with a word character. Adding \b next to a non-word character
# (as in "However,") would demand a word character right after it and make
# the entry unable to match ordinary text such as "However, this".
my $bounded = sub {
    my ($word) = @_;
    my $pat = quotemeta($word);
    $pat = '\b' . $pat if($word =~ /\A\w/);
    $pat .= '\b' if($word =~ /\w\z/);
    return $pat;
};

# Build a single combined regex for case-insensitive words. Empty words are
# dropped since an empty alternative would match on every line.
my $re_ci;
if(my @ci = grep { length($_) } @w) {
    my $pat = join('|', map { $bounded->($_) } @ci);
    $re_ci = qr/($pat)/i;
}

# Build a single combined regex for case-sensitive (exact) words
my $re_cs;
if(my @cs = grep { length($_) } @exact) {
    my $pat = join('|', map { $bounded->($_) } @cs);
    $re_cs = qr/($pat)/;
}
my $errors = 0;
# Report one bad-word hit on STDERR and count it in $errors, unless the hit
# is whitelisted.
#
# Arguments:
#   $p  - the text that precedes the match on the line
#   $w  - the matched text, exactly as it appears in the line
#   $in - the line text to show in the report
#   $f  - file name
#   $l  - line number
#
# The whitelist (%wl) is consulted with the keys "file:line:word" and
# "file::word"; a hit on either one suppresses the report.
sub highlight {
    my ($p, $w, $in, $f, $l) = @_;
    my $c = length($p)+1;

    my $ch = "$f:$l:$w";
    if($wl{$ch}) {
        # whitelisted filename + line + word
        return;
    }
    $ch = $f . "::" . $w;
    if($wl{$ch}) {
        # whitelisted filename + word
        return;
    }

    # %alt is keyed by the word as it was written in the word list, while $w
    # is the text as found in the file. A case-insensitive hit (for example
    # "Will" for the listed "will") has no direct entry, so fall back to a
    # caseless lookup instead of printing an undefined value.
    my $better = $alt{$w};
    if(!defined($better)) {
        my ($key) = grep { lc($_) eq lc($w) } sort keys %alt;
        $better = defined($key) ? $alt{$key} : "";
    }

    print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
    printf STDERR " %4d | %s\n", $l, $in;
    printf STDERR " | %*s^%s\n", length($p), " ",
        "~" x (length($w)-1);
    printf STDERR " maybe use \"%s\" instead?\n", $better;
    $errors++;
}
sub file {
my ($f) = @_;
my $l = 0;
@ -80,36 +124,22 @@ sub file {
$in =~ s/(\[.*\])\(.*\)/$1/g;
# remove backticked texts
$in =~ s/\`.*\`//g;
# remove whitelisted patterns
# remove whitelisted patterns (pre-compiled)
for my $p (@whitelist) {
$in =~ s/$p//g;
}
foreach my $w (@w) {
my $case = $exactcase{$w};
if(($in =~ /^(.*)$w/i && !$case) ||
($in =~ /^(.*)$w/ && $case) ) {
my $p = $1;
my $c = length($p)+1;
my $ch = "$f:$l:$w";
if($wl{$ch}) {
# whitelisted filename + line + word
#print STDERR "$ch found but whitelisted\n";
next;
# case-insensitive bad words
if($re_ci) {
while($in =~ /^(.*)$re_ci/i) {
highlight($1, $2, $in, $f, $l);
last;
}
$ch = $f . "::" . $w;
if($wl{$ch}) {
# whitelisted filename + word
#print STDERR "$ch found but whitelisted\n";
next;
}
print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
printf STDERR " %4d | %s\n", $l, $in;
printf STDERR " | %*s^%s\n", length($p), " ",
"~" x (length($w)-1);
printf STDERR " maybe use \"%s\" instead?\n", $alt{$w};
$errors++;
# case-sensitive (exact) bad words
if($re_cs) {
while($in =~ /^(.*)$re_cs/) {
highlight($1, $2, $in, $f, $l);
last;
}
}
}
@ -118,9 +148,18 @@ sub file {
my @filemasks = @ARGV;
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
my @files;
while(my $each = <$git_ls_files>) {
chomp $each;
file($each);
push @files, $each;
}
close $git_ls_files;
# Scan every file collected from git ls-files. $onum and $num only serve the
# (disabled) progress output below.
my $onum = scalar(@files);
my $num = 0;
for my $e (@files) {
    #printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
    file($e);
}

# The process exit status only keeps the low 8 bits, so a raw error count of
# 256 (or any multiple of it) would be reported as success. Clamp the value
# so that any number of errors yields a non-zero status.
exit($errors > 255 ? 255 : $errors);

View file

@ -4,4 +4,5 @@
#
# whitelisted uses of bad words
# file:[line]:rule
docs/FAQ.md::\bwill\b
docs/FAQ.md::will
docs/FAQ.md::Will

View file

@ -3,17 +3,17 @@
# SPDX-License-Identifier: curl
#
back-end:backend
\be-mail[^/]:email
e-mail:email
run-time:runtime
set-up:setup
tool chain:toolchain
tool-chain:toolchain
wild-card:wildcard
wild card:wildcard
\bthread ?safe[^."t]:thread-safe
\bthread ?unsafe[^."t]:thread-unsafe
multi ?thread:multi-thread
\bit's:it is
thread safe:thread-safe
thread unsafe:thread-unsafe
multi thread:multi-thread
it's:it is
aren't:are not
can't:cannot
could've:could have
@ -58,40 +58,40 @@ you've:you have
a ftp:an ftp
a IPv4:an IPv4
a IPv6:an IPv6
url [^=]=URL
[^/]internet\b=Internet
url= URL
internet=Internet
isation:ization
[^.]\. And: Rewrite it somehow?
^(And|So|But) = Rewrite it somehow?
\. But: Rewrite it somehow?
\. So : Rewrite without "so" ?
dir [^=]=directory
Dir [^=]=Directory
sub-director:subdirector
So=Rewrite it somehow?
And=Rewrite it somehow?
But=Rewrite it somehow?
sub-directory:subdirectory
web page:webpage
host name\b:hostname
host names\b:hostnames
[^;<]file name\b:filename
file names\b:filenames
\bfist\b:first
\buser name\b:username
\buser names\b:usernames
\bpass phrase:passphrase
\bwill\b:rewrite to present tense
\b[0-9]+bit[^*"'%,]: NN-bit
\b([02-9]|[1-9][0-9]+) bit\b: NN-bit
[0-9]+-bits:NN bits or NN-bit
\bvery\b:rephrase using an alternative word
\bjust\b:rephrase using an alternative word
\bCurl\b=curl
\bcURL\b=curl
\bLibcurl\b=libcurl
\bLibCurl\b=libcurl
host name:hostname
host names:hostnames
file name:filename
file names:filenames
fist:first
user name:username
user names:usernames
pass phrase:passphrase
will:rewrite to present tense
32 bit:32-bit
16 bit:16-bit
64 bit:64-bit
32-bits:32 bits
16-bits:16 bits
64-bits:64 bits
very:rephrase using an alternative word
just:rephrase using an alternative word
Curl=curl
cURL=curl
Libcurl=libcurl
LibCurl=libcurl
---WWW::Curl
---NET::Curl
---Curl Corporation
\bmanpages[^./;=&{:-]:man pages
\bmanpage[^si./;=&{:-]:man page
manpages:man pages
manpage:man page
favour:favor
basically:rephrase?
However,:rephrase?