mirror of
https://github.com/curl/curl.git
synced 2026-04-15 00:41:41 +03:00
badwords: move into ./scripts, speed up
- 'badwords' is now a target in Makefile.am - change badwords.txt to specify plain "words" instead of regexes so the script can build single regexes when scanning, which makes the script perform much faster (~6 times faster) Closes #20869
This commit is contained in:
parent
248dd9e55f
commit
713287188e
7 changed files with 123 additions and 78 deletions
7
.github/scripts/badwords.ok
vendored
7
.github/scripts/badwords.ok
vendored
|
|
@ -1,7 +0,0 @@
|
|||
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
|
||||
#
|
||||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
# whitelisted uses of bad words
|
||||
# file:[line]:rule
|
||||
docs/FAQ.md::\bwill\b
|
||||
126
.github/scripts/badwords.pl
vendored
126
.github/scripts/badwords.pl
vendored
|
|
@ -1,126 +0,0 @@
|
|||
#!/usr/bin/env perl
|
||||
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
|
||||
#
|
||||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
# bad[:=]correct
|
||||
#
|
||||
# If separator is '=', the string will be compared case sensitively.
|
||||
# If separator is ':', the check is done case insensitively.
|
||||
#
|
||||
# To add white listed uses of bad words that are removed before checking for
|
||||
# the bad ones:
|
||||
#
|
||||
# ---(accepted word)
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# Text patterns (the '---' lines read from STDIN) that are removed from a
# line before it is checked.
my @whitelist;
# Suggested replacement for each bad word, keyed by the bad word.
my %alt;
# Bad words that have to match case sensitively ('=' separator).
my %exactcase;
# When true, lines that start with a space are not checked.
my $skip_indented = 1;

# -a: check indented lines as well. @ARGV is tested first so that running
# the script without arguments does not compare an undefined value.
if(@ARGV && $ARGV[0] eq "-a") {
    shift @ARGV;
    $skip_indented = 0;
}
|
||||
# Whitelisted findings, keyed by "file:line:rule". The line part may be
# empty, which accepts the rule on every line of that file.
my %wl;

# -w <file>: load whitelisted findings, one "file:[line]:rule" per line.
# @ARGV is tested first so that an empty argument list does not compare an
# undefined value.
if(@ARGV && $ARGV[0] eq "-w") {
    shift @ARGV;
    my $file = shift @ARGV;
    defined($file) or die "Missing filename after -w\n";
    # Three-argument open with a lexical handle: the filename is never
    # interpreted as an open mode and the handle is not a global bareword.
    open(my $wlfh, '<', $file) or die "Cannot open '$file': $!";
    while(my $entry = <$wlfh>) {
        if($entry =~ /^#/) {
            # allow #-comments
            next;
        }
        if($entry =~ /^([^:]*):(\d*):(.*)/) {
            $wl{"$1:$2:$3"}=1;
            #print STDERR "whitelisted $1:$2:$3\n";
        }
    }
    close($wlfh);
}
|
||||
|
||||
# Bad words (regex sources) in the order they were read from STDIN.
my @w;

# Read the rules from STDIN, one per line:
#   #...          comment, ignored
#   ---text       accepted use of a bad word, stored in @whitelist
#   bad:correct   bad word matched case insensitively
#   bad=correct   bad word matched case sensitively
# Lines that fit none of these forms are ignored.
while(my $rule = <STDIN>) {
    chomp $rule;

    next if($rule =~ /^#/);

    if($rule =~ /^---(.*)/) {
        push @whitelist, $1;
        next;
    }

    # The leading (.*) is greedy, so a rule is split at its last ':' or '='.
    # That lets the bad word itself contain those characters.
    next unless($rule =~ /^(.*)([:=])(.*)/);

    my ($bad, $sep, $better) = ($1, $2, $3);
    push @w, $bad;
    $alt{$bad} = $better;
    $exactcase{$bad} = 1 if($sep eq "=");
}
|
||||
|
||||
# Number of bad words reported so far, over all files.
my $errors = 0;

# Check one file for bad words and report every finding on STDERR.
#
# $f is the name of the file to read. Each line is cleaned up first: the
# "(...)" target of a markdown-style link, text between backticks and every
# @whitelist pattern are removed. The cleaned line is then tried against each
# bad word in @w. Reported columns and the echoed line therefore refer to the
# cleaned line, not to the raw file contents.
#
# A finding is dropped when %wl holds "file:line:rule" or "file::rule" for
# it. Every other finding increments $errors.
#
# A file that cannot be opened is reported with a warning and skipped.
sub file {
    my ($f) = @_;
    my $l = 0;

    # Three-argument open with a lexical handle, and the result is checked:
    # an unreadable file must not be scanned through a closed handle.
    my $fh;
    if(!open($fh, '<', $f)) {
        warn "Cannot open '$f': $!\n";
        return;
    }

    while(my $in = <$fh>) {
        $l++;
        chomp $in;
        if($skip_indented && $in =~ /^ /) {
            next;
        }
        # remove the link part
        $in =~ s/(\[.*\])\(.*\)/$1/g;
        # remove backticked texts
        $in =~ s/\`.*\`//g;
        # remove whitelisted patterns
        for my $p (@whitelist) {
            $in =~ s/$p//g;
        }
        foreach my $w (@w) {
            # Words given with '=' are matched case sensitively, all others
            # case insensitively. The leading (.*) is greedy, so the last
            # occurrence on the line is the one that gets reported.
            my $found = $exactcase{$w} ?
                ($in =~ /^(.*)$w/) : ($in =~ /^(.*)$w/i);
            next if(!$found);

            # Read the match variables before any other regex runs.
            # $+[1] is where the prefix capture ends and $+[0] where the
            # whole match ends, so the difference is the length of the text
            # that matched the bad word.
            my $p = $1;
            my $matched = $+[0] - $+[1];
            my $c = length($p)+1;

            my $ch = "$f:$l:$w";
            if($wl{$ch}) {
                # whitelisted filename + line + word
                #print STDERR "$ch found but whitelisted\n";
                next;
            }
            $ch = $f . "::" . $w;
            if($wl{$ch}) {
                # whitelisted filename + word
                #print STDERR "$ch found but whitelisted\n";
                next;
            }

            print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
            printf STDERR " %4d | %s\n", $l, $in;
            # The prefix is as wide as the " %4d | " prefix above so the
            # caret lands under the first matched character, also when the
            # match starts in column 1. The underline covers the matched
            # text rather than the length of the regex source.
            printf STDERR "      | %s^%s\n", " " x length($p),
                "~" x ($matched > 1 ? $matched - 1 : 0);
            printf STDERR " maybe use \"%s\" instead?\n", $alt{$w};
            $errors++;
        }
    }
    close($fh);
    return;
}
|
||||
|
||||
# The remaining arguments are file masks. Ask git for the tracked files that
# match them and check each one.
my @filemasks = @ARGV;
open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!";
while(my $each = <$git_ls_files>) {
    chomp $each;
    file($each);
}
# close() on a piped handle returns false when the command exited non-zero.
# Without this check a failing git would leave nothing scanned and the run
# would still look successful.
close($git_ls_files) or
    die "git ls-files failed: " .
        ($! ? "$!" : "exit status " . ($? >> 8)) . "\n";

# Only the low 8 bits of an exit status reach the caller, so a count such as
# 256 would be seen as 0 (success). Cap the value to keep it non-zero.
exit($errors > 255 ? 255 : $errors);
|
||||
97
.github/scripts/badwords.txt
vendored
97
.github/scripts/badwords.txt
vendored
|
|
@ -1,97 +0,0 @@
|
|||
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
|
||||
#
|
||||
# SPDX-License-Identifier: curl
|
||||
#
|
||||
back-end:backend
|
||||
\be-mail[^/]:email
|
||||
run-time:runtime
|
||||
set-up:setup
|
||||
tool chain:toolchain
|
||||
tool-chain:toolchain
|
||||
wild-card:wildcard
|
||||
wild card:wildcard
|
||||
\bthread ?safe[^."t]:thread-safe
|
||||
\bthread ?unsafe[^."t]:thread-unsafe
|
||||
multi ?thread:multi-thread
|
||||
\bit's:it is
|
||||
aren't:are not
|
||||
can't:cannot
|
||||
could've:could have
|
||||
couldn't:could not
|
||||
didn't:did not
|
||||
doesn't:does not
|
||||
don't:do not
|
||||
haven't:have not
|
||||
i'd:I would
|
||||
i'll:I will
|
||||
i'm:I am
|
||||
i've:I have
|
||||
isn't:is not
|
||||
it'd:it would
|
||||
it'll:it will
|
||||
might've:might have
|
||||
needn't:need not
|
||||
should've:should have
|
||||
shouldn't:should not
|
||||
that's:that is
|
||||
there's:there is
|
||||
they'd:They would
|
||||
they'll:They will
|
||||
they're:They are
|
||||
they've:They have
|
||||
this'll:this will
|
||||
wasn't:was not
|
||||
we'd:we would
|
||||
we'll:we will
|
||||
we're:we are
|
||||
we've:we have
|
||||
weren't:were not
|
||||
won't:will not
|
||||
would've:would have
|
||||
wouldn't:would not
|
||||
you'd:you would
|
||||
you'll:you will
|
||||
you're:you are
|
||||
you've:you have
|
||||
a html: an html
|
||||
a http: an http
|
||||
a ftp: an ftp
|
||||
a IPv4: an IPv4
|
||||
a IPv6: an IPv6
|
||||
url [^=]=URL
|
||||
[^/]internet\b=Internet
|
||||
isation:ization
|
||||
[^.]\. And: Rewrite it somehow?
|
||||
^(And|So|But) = Rewrite it somehow?
|
||||
\. But: Rewrite it somehow?
|
||||
\. So : Rewrite without "so" ?
|
||||
dir [^=]=directory
|
||||
Dir [^=]=Directory
|
||||
sub-director:subdirector
|
||||
web page:webpage
|
||||
host name\b:hostname
|
||||
host names\b:hostnames
|
||||
[^;<]file name\b:filename
|
||||
file names\b:filenames
|
||||
\bfist\b:first
|
||||
\buser name\b:username
|
||||
\buser names\b:usernames
|
||||
\bpass phrase:passphrase
|
||||
\bwill\b:rewrite to present tense
|
||||
\b[0-9]+bit[^*"'%,]: NN-bit
|
||||
\b([02-9]|[1-9][0-9]+) bit\b: NN-bit
|
||||
[0-9]+-bits:NN bits or NN-bit
|
||||
\bvery\b:rephrase using an alternative word
|
||||
\bjust\b:rephrase using an alternative word
|
||||
\bCurl\b=curl
|
||||
\bcURL\b=curl
|
||||
\bLibcurl\b=libcurl
|
||||
\bLibCurl\b=libcurl
|
||||
---WWW::Curl
|
||||
---NET::Curl
|
||||
---Curl Corporation
|
||||
\bmanpages[^./;=&{:-]:man pages
|
||||
\bmanpage[^si./;=&{:-]:man page
|
||||
favour:favor
|
||||
basically:rephrase?
|
||||
However,:rephrase?
|
||||
Loading…
Add table
Add a link
Reference in a new issue