mirror of
https://github.com/curl/curl.git
synced 2026-04-14 18:11:40 +03:00
badwords: move into ./scripts, speed up
- 'badwords' is now a target in Makefile.am
- badwords.txt now specifies plain "words" instead of regexes, so the script can build combined regexes when scanning, which makes it much faster (~6 times)

Closes #20869
This commit is contained in:
parent
248dd9e55f
commit
713287188e
7 changed files with 123 additions and 78 deletions
2
.github/workflows/checkdocs.yml
vendored
2
.github/workflows/checkdocs.yml
vendored
|
|
@ -124,7 +124,7 @@ jobs:
|
|||
persist-credentials: false
|
||||
|
||||
- name: 'badwords'
|
||||
run: .github/scripts/badwords.pl -w .github/scripts/badwords.ok '**.md' projects/OS400/README.OS400 < .github/scripts/badwords.txt
|
||||
run: ./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt
|
||||
|
||||
- name: 'verify synopsis'
|
||||
run: .github/scripts/verify-synopsis.pl docs/libcurl/curl*.md
|
||||
|
|
|
|||
2
.github/workflows/checksrc.yml
vendored
2
.github/workflows/checksrc.yml
vendored
|
|
@ -181,4 +181,4 @@ jobs:
|
|||
- name: 'badwords'
|
||||
run: |
|
||||
# we allow some extra in source code
|
||||
grep -Ev '(\\bwill| But: | So : )' .github/scripts/badwords.txt | .github/scripts/badwords.pl -a src lib include docs/examples
|
||||
grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples
|
||||
|
|
|
|||
|
|
@ -181,3 +181,7 @@ tidy:
|
|||
|
||||
clean-local:
|
||||
(cd tests && $(MAKE) clean)
|
||||
|
||||
badwords:
|
||||
grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples
|
||||
./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt
|
||||
|
|
|
|||
|
|
@ -22,11 +22,12 @@
|
|||
#
|
||||
###########################################################################
|
||||
|
||||
EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl checksrc-all.pl \
|
||||
mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd cdall cd2cd managen \
|
||||
dmaketgz maketgz release-tools.sh verify-release cmakelint.sh mdlinkcheck \
|
||||
CMakeLists.txt perlcheck.sh pythonlint.sh spacecheck.pl randdisable wcurl \
|
||||
top-complexity extract-unit-protos .checksrc
|
||||
EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl \
|
||||
checksrc-all.pl mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd \
|
||||
cdall cd2cd managen dmaketgz maketgz release-tools.sh verify-release \
|
||||
cmakelint.sh mdlinkcheck CMakeLists.txt perlcheck.sh pythonlint.sh \
|
||||
spacecheck.pl randdisable wcurl top-complexity extract-unit-protos \
|
||||
.checksrc badwords badwords.ok badwords.txt
|
||||
|
||||
dist_bin_SCRIPTS = wcurl
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,11 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
|
||||
my @whitelist;
|
||||
# Patterns whose matches are stripped from a line before it is scanned for
# bad words. They are stored as compiled qr// objects so that each pattern
# is compiled once here instead of every time it is interpolated into a
# substitution.
my @whitelist = (
    # ignore what looks like URLs
    qr{(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'\(\)*+,;=]+)},
    # ignore bolded sections
    qr{\*\*(.*?)\*\*});
|
||||
my %alt;
|
||||
my %exactcase;
|
||||
my $skip_indented = 1;
|
||||
|
|
@ -45,6 +49,7 @@ if($ARGV[0] eq "-w") {
|
|||
}
|
||||
|
||||
my @w;
|
||||
my @exact;
|
||||
while(<STDIN>) {
|
||||
chomp;
|
||||
if($_ =~ /^#/) {
|
||||
|
|
@ -55,16 +60,55 @@ while(<STDIN>) {
|
|||
}
|
||||
elsif($_ =~ /^(.*)([:=])(.*)/) {
|
||||
my ($bad, $sep, $better)=($1, $2, $3);
|
||||
push @w, $bad;
|
||||
$alt{$bad} = $better;
|
||||
if($sep eq "=") {
|
||||
$exactcase{$bad} = 1;
|
||||
push @exact, $bad;
|
||||
}
|
||||
else {
|
||||
push @w, $bad;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Turn one literal word from the word list into a regex fragment.
#
# The word is escaped with quotemeta() so it always matches literally. A
# word-boundary anchor is added only on a side where the word itself has a
# word character: "\b" next to a non-word character (as in "However,") would
# demand a word character right after it and so practically never match.
sub word_pattern {
    my ($word) = @_;
    my $pat = quotemeta($word);
    if($word =~ /\A\w/) {
        $pat = '\b' . $pat;
    }
    if($word =~ /\w\z/) {
        $pat .= '\b';
    }
    return $pat;
}

# Build a single combined regex for case-insensitive words. Empty words are
# dropped since an empty alternative would match everywhere. The regex
# stays undefined when there is nothing to look for.
my $re_ci;
{
    my @pats = map { word_pattern($_) } grep { length($_) } @w;
    if(@pats) {
        my $pat = join('|', @pats);
        $re_ci = qr/($pat)/i;
    }
}

# Build a single combined regex for case-sensitive (exact) words, with the
# same rules as above.
my $re_cs;
{
    my @pats = map { word_pattern($_) } grep { length($_) } @exact;
    if(@pats) {
        my $pat = join('|', @pats);
        $re_cs = qr/($pat)/;
    }
}
|
||||
|
||||
my $errors = 0;
|
||||
|
||||
# Report a single bad-word hit on stderr, unless it is whitelisted.
#
# Arguments:
#   $p  - the text on the line that precedes the match (gives the column)
#   $w  - the matched word, exactly as it appears in the line
#   $in - the line as scanned, echoed in the report
#   $f  - file name
#   $l  - line number
#
# Reads %wl (whitelist entries keyed "file:line:word" or "file::word") and
# %alt (suggested replacement per listed word). Increments $errors for every
# hit that is reported. Returns nothing.
sub highlight {
    my ($p, $w, $in, $f, $l) = @_;

    my $c = length($p)+1;
    my $ch = "$f:$l:$w";
    if($wl{$ch}) {
        # whitelisted filename + line + word
        return;
    }
    $ch = $f . "::" . $w;
    if($wl{$ch}) {
        # whitelisted filename + word
        return;
    }

    # %alt is keyed by the word as written in the word list, while $w is the
    # text found in the file. A case-insensitive hit such as "Will" has no
    # exact key, so fall back to a case-insensitive key search. The keys are
    # sorted to make the choice deterministic.
    my $better = $alt{$w};
    if(!defined($better)) {
        for my $k (sort keys %alt) {
            if(lc($k) eq lc($w)) {
                $better = $alt{$k};
                last;
            }
        }
    }

    print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
    printf STDERR " %4d | %s\n", $l, $in;
    # The gutter has the same width as " %4d | " above and the padding is
    # exactly length($p) characters, so the caret lands under the first
    # character of the match even when the match starts the line.
    printf STDERR "      | %s^%s\n", " " x length($p),
        "~" x (length($w)-1);
    if(defined($better)) {
        printf STDERR " maybe use \"%s\" instead?\n", $better;
    }
    $errors++;
    return;
}
|
||||
|
||||
sub file {
|
||||
my ($f) = @_;
|
||||
my $l = 0;
|
||||
|
|
@ -80,36 +124,22 @@ sub file {
|
|||
$in =~ s/(\[.*\])\(.*\)/$1/g;
|
||||
# remove backticked texts
|
||||
$in =~ s/\`.*\`//g;
|
||||
# remove whitelisted patterns
|
||||
# remove whitelisted patterns (pre-compiled)
|
||||
for my $p (@whitelist) {
|
||||
$in =~ s/$p//g;
|
||||
}
|
||||
foreach my $w (@w) {
|
||||
my $case = $exactcase{$w};
|
||||
if(($in =~ /^(.*)$w/i && !$case) ||
|
||||
($in =~ /^(.*)$w/ && $case) ) {
|
||||
my $p = $1;
|
||||
my $c = length($p)+1;
|
||||
|
||||
my $ch = "$f:$l:$w";
|
||||
if($wl{$ch}) {
|
||||
# whitelisted filename + line + word
|
||||
#print STDERR "$ch found but whitelisted\n";
|
||||
next;
|
||||
}
|
||||
$ch = $f . "::" . $w;
|
||||
if($wl{$ch}) {
|
||||
# whitelisted filename + word
|
||||
#print STDERR "$ch found but whitelisted\n";
|
||||
next;
|
||||
}
|
||||
|
||||
print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
|
||||
printf STDERR " %4d | %s\n", $l, $in;
|
||||
printf STDERR " | %*s^%s\n", length($p), " ",
|
||||
"~" x (length($w)-1);
|
||||
printf STDERR " maybe use \"%s\" instead?\n", $alt{$w};
|
||||
$errors++;
|
||||
# case-insensitive bad words
|
||||
if($re_ci) {
|
||||
while($in =~ /^(.*)$re_ci/i) {
|
||||
highlight($1, $2, $in, $f, $l);
|
||||
last;
|
||||
}
|
||||
}
|
||||
# case-sensitive (exact) bad words
|
||||
if($re_cs) {
|
||||
while($in =~ /^(.*)$re_cs/) {
|
||||
highlight($1, $2, $in, $f, $l);
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -118,9 +148,18 @@ sub file {
|
|||
|
||||
my @filemasks = @ARGV;
|
||||
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
|
||||
my @files;
|
||||
while(my $each = <$git_ls_files>) {
|
||||
chomp $each;
|
||||
file($each);
|
||||
push @files, $each;
|
||||
}
|
||||
close $git_ls_files;
|
||||
|
||||
my $onum = scalar(@files);
|
||||
my $num;
|
||||
for my $e (@files) {
|
||||
#printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum;
|
||||
file($e);
|
||||
}
|
||||
|
||||
exit $errors;
|
||||
|
|
@ -4,4 +4,5 @@
|
|||
#
|
||||
# whitelisted uses of bad words
|
||||
# file:[line]:rule
|
||||
docs/FAQ.md::\bwill\b
|
||||
docs/FAQ.md::will
|
||||
docs/FAQ.md::Will
|
||||
|
|
@ -3,17 +3,17 @@
|
|||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
back-end:backend
|
||||
\be-mail[^/]:email
|
||||
e-mail:email
|
||||
run-time:runtime
|
||||
set-up:setup
|
||||
tool chain:toolchain
|
||||
tool-chain:toolchain
|
||||
wild-card:wildcard
|
||||
wild card:wildcard
|
||||
\bthread ?safe[^."t]:thread-safe
|
||||
\bthread ?unsafe[^."t]:thread-unsafe
|
||||
multi ?thread:multi-thread
|
||||
\bit's:it is
|
||||
thread safe:thread-safe
|
||||
thread unsafe:thread-unsafe
|
||||
multi thread:multi-thread
|
||||
it's:it is
|
||||
aren't:are not
|
||||
can't:cannot
|
||||
could've:could have
|
||||
|
|
@ -53,45 +53,45 @@ you'd:you would
|
|||
you'll:you will
|
||||
you're:you are
|
||||
you've:you have
|
||||
a html: an html
|
||||
a http: an http
|
||||
a ftp: an ftp
|
||||
a IPv4: an IPv4
|
||||
a IPv6: an IPv6
|
||||
url [^=]=URL
|
||||
[^/]internet\b=Internet
|
||||
a html:an html
|
||||
a http:an http
|
||||
a ftp:an ftp
|
||||
a IPv4:an IPv4
|
||||
a IPv6:an IPv6
|
||||
url= URL
|
||||
internet=Internet
|
||||
isation:ization
|
||||
[^.]\. And: Rewrite it somehow?
|
||||
^(And|So|But) = Rewrite it somehow?
|
||||
\. But: Rewrite it somehow?
|
||||
\. So : Rewrite without "so" ?
|
||||
dir [^=]=directory
|
||||
Dir [^=]=Directory
|
||||
sub-director:subdirector
|
||||
So=Rewrite it somehow?
|
||||
And=Rewrite it somehow?
|
||||
But=Rewrite it somehow?
|
||||
sub-directory:subdirectory
|
||||
web page:webpage
|
||||
host name\b:hostname
|
||||
host names\b:hostnames
|
||||
[^;<]file name\b:filename
|
||||
file names\b:filenames
|
||||
\bfist\b:first
|
||||
\buser name\b:username
|
||||
\buser names\b:usernames
|
||||
\bpass phrase:passphrase
|
||||
\bwill\b:rewrite to present tense
|
||||
\b[0-9]+bit[^*"'%,]: NN-bit
|
||||
\b([02-9]|[1-9][0-9]+) bit\b: NN-bit
|
||||
[0-9]+-bits:NN bits or NN-bit
|
||||
\bvery\b:rephrase using an alternative word
|
||||
\bjust\b:rephrase using an alternative word
|
||||
\bCurl\b=curl
|
||||
\bcURL\b=curl
|
||||
\bLibcurl\b=libcurl
|
||||
\bLibCurl\b=libcurl
|
||||
host name:hostname
|
||||
host names:hostnames
|
||||
file name:filename
|
||||
file names:filenames
|
||||
fist:first
|
||||
user name:username
|
||||
user names:usernames
|
||||
pass phrase:passphrase
|
||||
will:rewrite to present tense
|
||||
32 bit:32-bit
|
||||
16 bit:16-bit
|
||||
64 bit:64-bit
|
||||
32-bits:32 bits
|
||||
16-bits:16 bits
|
||||
64-bits:64 bits
|
||||
very:rephrase using an alternative word
|
||||
just:rephrase using an alternative word
|
||||
Curl=curl
|
||||
cURL=curl
|
||||
Libcurl=libcurl
|
||||
LibCurl=libcurl
|
||||
---WWW::Curl
|
||||
---NET::Curl
|
||||
---Curl Corporation
|
||||
\bmanpages[^./;=&{:-]:man pages
|
||||
\bmanpage[^si./;=&{:-]:man page
|
||||
manpages:man pages
|
||||
manpage:man page
|
||||
favour:favor
|
||||
basically:rephrase?
|
||||
However,:rephrase?
|
||||
Loading…
Add table
Add a link
Reference in a new issue