diff --git a/.github/workflows/checkdocs.yml b/.github/workflows/checkdocs.yml index 87571076a3..c3f9875a89 100644 --- a/.github/workflows/checkdocs.yml +++ b/.github/workflows/checkdocs.yml @@ -124,7 +124,7 @@ jobs: persist-credentials: false - name: 'badwords' - run: .github/scripts/badwords.pl -w .github/scripts/badwords.ok '**.md' projects/OS400/README.OS400 < .github/scripts/badwords.txt + run: ./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt - name: 'verify synopsis' run: .github/scripts/verify-synopsis.pl docs/libcurl/curl*.md diff --git a/.github/workflows/checksrc.yml b/.github/workflows/checksrc.yml index b55789a014..0a26e9c1b0 100644 --- a/.github/workflows/checksrc.yml +++ b/.github/workflows/checksrc.yml @@ -181,4 +181,4 @@ jobs: - name: 'badwords' run: | # we allow some extra in source code - grep -Ev '(\\bwill| But: | So : )' .github/scripts/badwords.txt | .github/scripts/badwords.pl -a src lib include docs/examples + grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples diff --git a/Makefile.am b/Makefile.am index 1913f5d007..4a76fcb12e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -181,3 +181,7 @@ tidy: clean-local: (cd tests && $(MAKE) clean) + +badwords: + grep -Ev '^(will:|But=|So=|And=| url=)' ./scripts/badwords.txt | ./scripts/badwords -a src lib include docs/examples + ./scripts/badwords -w ./scripts/badwords.ok '**.md' projects/OS400/README.OS400 < ./scripts/badwords.txt diff --git a/scripts/Makefile.am b/scripts/Makefile.am index b35c1f2a37..36957d1cfb 100644 --- a/scripts/Makefile.am +++ b/scripts/Makefile.am @@ -22,11 +22,12 @@ # ########################################################################### -EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl checksrc-all.pl \ - mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd cdall cd2cd managen \ - dmaketgz maketgz release-tools.sh 
verify-release cmakelint.sh mdlinkcheck \
-  CMakeLists.txt perlcheck.sh pythonlint.sh spacecheck.pl randdisable wcurl \
-  top-complexity extract-unit-protos .checksrc
+EXTRA_DIST = coverage.sh completion.pl firefox-db2pem.sh checksrc.pl \
+  checksrc-all.pl mk-ca-bundle.pl mk-unity.pl schemetable.c cd2nroff nroff2cd \
+  cdall cd2cd managen dmaketgz maketgz release-tools.sh verify-release \
+  cmakelint.sh mdlinkcheck CMakeLists.txt perlcheck.sh pythonlint.sh \
+  spacecheck.pl randdisable wcurl top-complexity extract-unit-protos \
+  .checksrc badwords badwords.ok badwords.txt
 
 dist_bin_SCRIPTS = wcurl
 
diff --git a/.github/scripts/badwords.pl b/scripts/badwords
similarity index 51%
rename from .github/scripts/badwords.pl
rename to scripts/badwords
index 255cb92c8f..149150ef23 100755
--- a/.github/scripts/badwords.pl
+++ b/scripts/badwords
@@ -17,7 +17,11 @@
 use strict;
 use warnings;
 
-my @whitelist;
+my @whitelist = (
+    # ignore what looks like URLs
+    '(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)',
+    # ignore bolded sections
+    '\*\*(.*?)\*\*');
 my %alt;
 my %exactcase;
 my $skip_indented = 1;
@@ -45,6 +49,7 @@ if($ARGV[0] eq "-w") {
 }
 
 my @w;
+my @exact;
 while(<STDIN>) {
     chomp;
     if($_ =~ /^#/) {
@@ -55,16 +60,85 @@ while(<STDIN>) {
     }
     elsif($_ =~ /^(.*)([:=])(.*)/) {
         my ($bad, $sep, $better)=($1, $2, $3);
-        push @w, $bad;
         $alt{$bad} = $better;
         if($sep eq "=") {
-            $exactcase{$bad} = 1;
+            push @exact, $bad;
+        }
+        else {
+            push @w, $bad;
         }
     }
 }
 
+# Turn one bad word into a regex fragment. A \b anchor is only added on a
+# side where the word itself has a word character: next to punctuation or a
+# space (as in "However," or " url") \b would instead demand a word character
+# on the far side, so the rule would miss running text like "However, the".
+sub wordpattern {
+    my ($word) = @_;
+    my $pat = quotemeta($word);
+    $pat = '\b' . $pat if($word =~ /^\w/);
+    $pat .= '\b' if($word =~ /\w\z/);
+    return $pat;
+}
+
+# Build a single combined regex for case-insensitive words
+my $re_ci;
+if(@w) {
+    my $pat = join('|', map { wordpattern($_) } @w);
+    $re_ci = qr/($pat)/i;
+}
+
+# Build a single combined regex for case-sensitive (exact) words
+my $re_cs;
+if(@exact) {
+    my $pat = join('|', map { wordpattern($_) } @exact);
+    $re_cs = qr/($pat)/;
+}
+
 my $errors = 0;
 
+# Report one bad word hit on STDERR and count it, unless it is whitelisted.
+#   $p  - the text in front of the hit on the line (gives the column)
+#   $w  - the matched text, spelled as it appears on the line
+#   $in - the line as it was searched
+#   $f  - filename
+#   $l  - line number
+sub highlight {
+    my ($p, $w, $in, $f, $l) = @_;
+
+    # Map the matched text back to the rule that produced it. A
+    # case-insensitive hit ("Will") can be spelled differently from its
+    # rule ("will"), and %alt is keyed by the rule's own spelling.
+    my $rule = $w;
+    if(!exists $alt{$rule}) {
+        ($rule) = grep { lc($_) eq lc($w) } @w;
+        $rule = $w if(!defined($rule));
+    }
+
+    my $c = length($p)+1;
+    for my $key ($w, $rule) {
+        if($wl{"$f:$l:$key"}) {
+            # whitelisted filename + line + word
+            return;
+        }
+        if($wl{$f . "::" . $key}) {
+            # whitelisted filename + word
+            return;
+        }
+    }
+
+    my $better = $alt{$rule};
+    $better = "" if(!defined($better));
+
+    print STDERR "$f:$l:$c: error: found bad word \"$w\"\n";
+    printf STDERR " %4d | %s\n", $l, $in;
+    printf STDERR "      | %*s^%s\n", length($p), " ",
+        "~" x (length($w)-1);
+    printf STDERR " maybe use \"%s\" instead?\n", $better;
+    $errors++;
+}
+
 sub file {
     my ($f) = @_;
     my $l = 0;
@@ -80,36 +154,21 @@ sub file {
         $in =~ s/(\[.*\])\(.*\)/$1/g;
         # remove backticked texts
         $in =~ s/\`.*\`//g;
         # remove whitelisted patterns
         for my $p (@whitelist) {
             $in =~ s/$p//g;
         }
-        foreach my $w (@w) {
-            my $case = $exactcase{$w};
-            if(($in =~ /^(.*)$w/i && !$case) ||
-               ($in =~ /^(.*)$w/ && $case) ) {
-                my $p = $1;
-                my $c = length($p)+1;
-
-                my $ch = "$f:$l:$w";
-                if($wl{$ch}) {
-                    # whitelisted filename + line + word
-                    #print STDERR "$ch found but whitelisted\n";
-                    next;
-                }
-                $ch = $f . "::" . $w;
-                if($wl{$ch}) {
-                    # whitelisted filename + word
-                    #print STDERR "$ch found but whitelisted\n";
-                    next;
-                }
-
-                print STDERR  "$f:$l:$c: error: found bad word \"$w\"\n";
-                printf STDERR " %4d | %s\n", $l, $in;
-                printf STDERR "      | %*s^%s\n", length($p), " ",
-                    "~" x (length($w)-1);
-                printf STDERR " maybe use \"%s\" instead?\n", $alt{$w};
-                $errors++;
+        # case-insensitive bad words. /g walks every hit on the line, so
+        # a whitelisted hit cannot hide another bad word on the same line.
+        if($re_ci) {
+            while($in =~ /$re_ci/g) {
+                highlight(substr($in, 0, $-[1]), $1, $in, $f, $l);
+            }
+        }
+        # case-sensitive (exact) bad words
+        if($re_cs) {
+            while($in =~ /$re_cs/g) {
+                highlight(substr($in, 0, $-[1]), $1, $in, $f, $l);
             }
         }
     }
@@ -118,9 +177,18 @@ sub file {
 my @filemasks = @ARGV;
 open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or
     die "Failed running git ls-files: $!";
+my @files;
 while(my $each = <$git_ls_files>) {
     chomp $each;
-    file($each);
+    push @files, $each;
 }
 close $git_ls_files;
-exit $errors;
+
+# scan only once the complete file list has been read from git
+for my $e (@files) {
+    file($e);
+}
+
+# An exit status is only eight bits wide, so the raw count would wrap and
+# 256 errors would look like success. Report plain failure instead.
+exit($errors ? 1 : 0);
diff --git
a/.github/scripts/badwords.ok b/scripts/badwords.ok similarity index 80% rename from .github/scripts/badwords.ok rename to scripts/badwords.ok index fe2d9cfaf1..549901f9b2 100644 --- a/.github/scripts/badwords.ok +++ b/scripts/badwords.ok @@ -4,4 +4,5 @@ # # whitelisted uses of bad words # file:[line]:rule -docs/FAQ.md::\bwill\b +docs/FAQ.md::will +docs/FAQ.md::Will diff --git a/.github/scripts/badwords.txt b/scripts/badwords.txt similarity index 50% rename from .github/scripts/badwords.txt rename to scripts/badwords.txt index 0e34fe3569..8879ad6dc1 100644 --- a/.github/scripts/badwords.txt +++ b/scripts/badwords.txt @@ -3,17 +3,17 @@ # SPDX-License-Identifier: curl # back-end:backend -\be-mail[^/]:email +e-mail:email run-time:runtime set-up:setup tool chain:toolchain tool-chain:toolchain wild-card:wildcard wild card:wildcard -\bthread ?safe[^."t]:thread-safe -\bthread ?unsafe[^."t]:thread-unsafe -multi ?thread:multi-thread -\bit's:it is +thread safe:thread-safe +thread unsafe:thread-unsafe +multi thread:multi-thread +it's:it is aren't:are not can't:cannot could've:could have @@ -53,45 +53,45 @@ you'd:you would you'll:you will you're:you are you've:you have - a html: an html - a http: an http - a ftp: an ftp - a IPv4: an IPv4 - a IPv6: an IPv6 - url [^=]=URL -[^/]internet\b=Internet +a html:an html +a http:an http +a ftp:an ftp +a IPv4:an IPv4 +a IPv6:an IPv6 + url= URL +internet=Internet isation:ization -[^.]\. And: Rewrite it somehow? -^(And|So|But) = Rewrite it somehow? -\. But: Rewrite it somehow? -\. So : Rewrite without "so" ? - dir [^=]=directory - Dir [^=]=Directory -sub-director:subdirector +So=Rewrite it somehow? +And=Rewrite it somehow? +But=Rewrite it somehow? 
+sub-directory:subdirectory web page:webpage -host name\b:hostname -host names\b:hostnames -[^;<]file name\b:filename -file names\b:filenames -\bfist\b:first -\buser name\b:username -\buser names\b:usernames -\bpass phrase:passphrase -\bwill\b:rewrite to present tense -\b[0-9]+bit[^*"'%,]: NN-bit -\b([02-9]|[1-9][0-9]+) bit\b: NN-bit -[0-9]+-bits:NN bits or NN-bit -\bvery\b:rephrase using an alternative word -\bjust\b:rephrase using an alternative word -\bCurl\b=curl -\bcURL\b=curl -\bLibcurl\b=libcurl -\bLibCurl\b=libcurl +host name:hostname +host names:hostnames +file name:filename +file names:filenames +fist:first +user name:username +user names:usernames +pass phrase:passphrase +will:rewrite to present tense +32 bit:32-bit +16 bit:16-bit +64 bit:64-bit +32-bits:32 bits +16-bits:16 bits +64-bits:64 bits +very:rephrase using an alternative word +just:rephrase using an alternative word +Curl=curl +cURL=curl +Libcurl=libcurl +LibCurl=libcurl ---WWW::Curl ---NET::Curl ---Curl Corporation -\bmanpages[^./;=&{:-]:man pages -\bmanpage[^si./;=&{:-]:man page +manpages:man pages +manpage:man page favour:favor basically:rephrase? However,:rephrase?