badwords: move into ./scripts, speed up

- 'badwords' is now a target in Makefile.am

- change badwords.txt to specify plain "words" instead of regexes so the
  script can build single regexes when scanning, which makes the script
  perform much faster (~6 times faster)

Closes #20869
This commit is contained in:
Daniel Stenberg 2026-03-09 13:32:14 +01:00
parent 248dd9e55f
commit 713287188e
No known key found for this signature in database
GPG key ID: 5CC908FDB71E12C2
7 changed files with 123 additions and 78 deletions

View file

@ -1,7 +0,0 @@
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# whitelisted uses of bad words
# file:[line]:rule
docs/FAQ.md::\bwill\b

View file

@ -1,126 +0,0 @@
#!/usr/bin/env perl
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
# bad[:=]correct
#
# If separator is '=', the string will be compared case sensitively.
# If separator is ':', the check is done case insensitively.
#
# To whitelist a phrase, so that it is removed from each line before the bad
# words are looked for, add a line of the form:
#
# ---(accepted word)
#
use strict;
use warnings;
# Phrases (from "---" lines of the word list) that are removed from every
# scanned line before the bad words are looked for.
my @whitelist;
# bad word pattern => suggested replacement text
my %alt;
# bad word patterns that are matched case sensitively ('=' separator)
my %exactcase;

# Lines that start with a space are skipped by default; -a scans them too.
my $skip_indented = 1;
# Look at the first argument only when there is one, so that running the
# script without arguments does not compare an undefined value.
if(@ARGV && $ARGV[0] eq "-a") {
    shift @ARGV;
    $skip_indented = 0;
}
# Whitelisted findings loaded from the file given with -w. The keys have the
# form "file:line:rule"; the line part may be empty ("file::rule").
my %wl;
# Look at the first argument only when there is one, so that running the
# script without arguments does not compare an undefined value.
if(@ARGV && $ARGV[0] eq "-w") {
    shift @ARGV;
    my $file = shift @ARGV;
    defined($file) or die "Option -w requires a file name\n";
    # Three-argument open with a lexical handle: the file name can never be
    # taken for part of the open mode, and the handle is not a global.
    open(my $wfh, '<', $file) or die "Cannot open '$file': $!";
    while(my $line = <$wfh>) {
        if($line =~ /^#/) {
            # allow #-comments
            next;
        }
        if($line =~ /^([^:]*):(\d*):(.*)/) {
            $wl{"$1:$2:$3"}=1;
            #print STDERR "whitelisted $1:$2:$3\n";
        }
    }
    close($wfh);
}
# Bad word patterns, in the order they were read from stdin.
my @w;
while(my $entry = <STDIN>) {
    chomp $entry;

    # comment lines carry no rule
    next if($entry =~ /^#/);

    # "---phrase" adds a whitelisted phrase
    if($entry =~ /^---(.*)/) {
        push @whitelist, $1;
        next;
    }

    # "bad:better" or "bad=better". The first group is greedy, so the LAST
    # ':' or '=' on the line is the separator and the bad word itself may
    # contain those characters.
    next unless($entry =~ /^(.*)([:=])(.*)/);
    my ($pattern, $separator, $suggestion) = ($1, $2, $3);

    push @w, $pattern;
    $alt{$pattern} = $suggestion;
    # '=' asks for a case sensitive comparison
    $exactcase{$pattern} = 1 if($separator eq "=");
}
# Number of bad word findings reported so far, across all scanned files.
my $errors = 0;

# Compiled matcher for each bad word, built the first time the word is used
# so the pattern is not recompiled for every scanned line.
my %word_re;

# file($f)
#
# Scan the file $f line by line and report each bad word found on stderr,
# adding one to $errors per reported finding.
#
# Before matching, the "(...)" part of "[text](...)" links, backticked text
# and the whitelisted phrases are removed from the line. Lines that start
# with a space are skipped while $skip_indented is set. A finding is not
# reported when %wl has a "file:line:word" or a "file::word" entry for it.
#
# A file that cannot be opened is reported on stderr and skipped.
sub file {
    my ($f) = @_;
    my $lineno = 0;
    my $fh;
    if(!open($fh, '<', $f)) {
        print STDERR "Cannot open '$f': $!\n";
        return;
    }
    while(my $in = <$fh>) {
        $lineno++;
        chomp $in;
        if($skip_indented && $in =~ /^ /) {
            next;
        }
        # remove the link part
        $in =~ s/(\[.*\])\(.*\)/$1/g;
        # remove backticked texts
        $in =~ s/\`.*\`//g;
        # remove whitelisted patterns
        for my $p (@whitelist) {
            $in =~ s/$p//g;
        }
        foreach my $w (@w) {
            # case sensitive matcher for '=' words, case insensitive otherwise
            my $re = $word_re{$w} ||=
                $exactcase{$w} ? qr/^(.*)$w/ : qr/^(.*)$w/i;
            if($in =~ $re) {
                # text in front of the bad word; the group is greedy, so this
                # locates the last occurrence on the line
                my $p = $1;
                # length of the text the bad word pattern actually matched:
                # end of the whole match minus end of the leading group
                my $found = $+[0] - $+[1];
                my $c = length($p)+1;
                my $ch = "$f:$lineno:$w";
                if($wl{$ch}) {
                    # whitelisted filename + line + word
                    #print STDERR "$ch found but whitelisted\n";
                    next;
                }
                $ch = $f . "::" . $w;
                if($wl{$ch}) {
                    # whitelisted filename + word
                    #print STDERR "$ch found but whitelisted\n";
                    next;
                }
                print STDERR "$f:$lineno:$c: error: found bad word \"$w\"\n";
                # The marker line gets a gutter of the same width as the one
                # in front of the quoted text, so the caret ends up directly
                # below the first character of the match, also at column 0.
                my $gutter = sprintf(" %4d | ", $lineno);
                print STDERR "$gutter$in\n";
                printf STDERR "%s| %s^%s\n", " " x (length($gutter) - 2),
                    " " x length($p), "~" x ($found - 1);
                printf STDERR " maybe use \"%s\" instead?\n", $alt{$w};
                $errors++;
            }
        }
    }
    close($fh);
}
# The remaining arguments are file masks; git lists the tracked files that
# match them and each listed file is scanned.
my @filemasks = @ARGV;
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
while(my $each = <$git_ls_files>) {
    chomp $each;
    file($each);
}
close $git_ls_files;
# The exit status is the number of findings, capped at 255: a process exit
# status only keeps the low 8 bits, so an uncapped count of 256 would be
# reported as 0, i.e. success.
exit($errors > 255 ? 255 : $errors);

View file

@ -1,97 +0,0 @@
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# SPDX-License-Identifier: curl
#
back-end:backend
\be-mail[^/]:email
run-time:runtime
set-up:setup
tool chain:toolchain
tool-chain:toolchain
wild-card:wildcard
wild card:wildcard
\bthread ?safe[^."t]:thread-safe
\bthread ?unsafe[^."t]:thread-unsafe
multi ?thread:multi-thread
\bit's:it is
aren't:are not
can't:cannot
could've:could have
couldn't:could not
didn't:did not
doesn't:does not
don't:do not
haven't:have not
i'd:I would
i'll:I will
i'm:I am
i've:I have
isn't:is not
it'd:it would
it'll:it will
might've:might have
needn't:need not
should've:should have
shouldn't:should not
that's:that is
there's:there is
they'd:they would
they'll:they will
they're:they are
they've:they have
this'll:this will
wasn't:was not
we'd:we would
we'll:we will
we're:we are
we've:we have
weren't:were not
won't:will not
would've:would have
wouldn't:would not
you'd:you would
you'll:you will
you're:you are
you've:you have
a html: an html
a http: an http
a ftp: an ftp
a IPv4: an IPv4
a IPv6: an IPv6
url [^=]=URL
[^/]internet\b=Internet
isation:ization
[^.]\. And: Rewrite it somehow?
^(And|So|But) = Rewrite it somehow?
\. But: Rewrite it somehow?
\. So : Rewrite without "so" ?
dir [^=]=directory
Dir [^=]=Directory
sub-director:subdirector
web page:webpage
host name\b:hostname
host names\b:hostnames
[^;<]file name\b:filename
file names\b:filenames
\bfist\b:first
\buser name\b:username
\buser names\b:usernames
\bpass phrase:passphrase
\bwill\b:rewrite to present tense
\b[0-9]+bit[^*"'%,]: NN-bit
\b([02-9]|[1-9][0-9]+) bit\b: NN-bit
[0-9]+-bits:NN bits or NN-bit
\bvery\b:rephrase using an alternative word
\bjust\b:rephrase using an alternative word
\bCurl\b=curl
\bcURL\b=curl
\bLibcurl\b=libcurl
\bLibCurl\b=libcurl
---WWW::Curl
---NET::Curl
---Curl Corporation
\bmanpages[^./;=&{:-]:man pages
\bmanpage[^si./;=&{:-]:man page
favour:favor
basically:rephrase?
However,:rephrase?