#!/usr/bin/env perl # Copyright (C) Daniel Stenberg, , et al. # # SPDX-License-Identifier: curl # # bad[:=]correct # # If separator is '=', the string will be compared case sensitively. # If separator is ':', the check is done case insensitively. # # To add white listed uses of bad words that are removed before checking for # the bad ones: # # ---(accepted word) # use strict; use warnings; use File::Basename; my @whitelist = ( # ignore what looks like URLs '(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&\'\(\)*+,;=]+)', # remove bolded sections '\*\*.*?\*\*', # remove backticked texts '\`.*?\`' ); my %alt; my %exactcase; my $skip_indented = 1; if($ARGV[0] eq "-a") { shift @ARGV; $skip_indented = 0; } my %wl; if($ARGV[0] eq "-w") { shift @ARGV; my $file = shift @ARGV; open(W, "<$file") or die "Cannot open '$file': $!"; while() { if(/^#/) { # allow #-comments next; } if(/^([^:]*):(\d*):(.*)/) { $wl{"$1:$2:$3"}=1; #print STDERR "whitelisted $1:$2:$3\n"; } } close(W); } my @w; my @exact; while() { chomp; if($_ =~ /^#/) { next; } if($_ =~ /^---(.+)/) { push @whitelist, $1; } elsif($_ =~ /^(.*)([:=])(.*)/) { my ($bad, $sep, $better)=($1, $2, $3); if($sep eq "=") { $alt{$bad} = $better; push @exact, $bad; } else { $alt{lc($bad)} = $better; push @w, $bad; } } } # Build a single combined regex for case-insensitive words my $re_ci; if(@w) { my $pat = join('|', map { quotemeta($_) } @w); $re_ci = qr/\b($pat)\b/i; } # Build a single combined regex for case-sensitive (exact) words my $re_cs; if(@exact) { my $pat = join('|', map { quotemeta($_) } @exact); $re_cs = qr/\b($pat)\b/; } # Build a single combined regex for removing whitelisted content my $re_wl; my $pat = join('|', map { $_ } @whitelist); $re_wl = qr/($pat)/; my $errors = 0; sub highlight { my ($p, $w, $in, $f, $l, $lookup) = @_; my $c = length($p)+1; my $ch; my $dir = dirname($f); $ch = $dir . "/" . "::" . $w; if($wl{$ch}) { # whitelisted dirname + word return; } my $updir = dirname($dir); if($dir ne $updir) { $ch = $updir . "/" . "::" . $w; if($wl{$ch}) { # whitelisted upper dirname + word return; } } $ch = $f . "::" . $w; if($wl{$ch}) { # whitelisted filename + word return; } $ch = "$f:$l:$w"; if($wl{$ch}) { # whitelisted filename + line + word return; } print STDERR "$f:$l:$c: error: found bad word \"$w\"\n"; printf STDERR " %4d | %s\n", $l, $in; printf STDERR " | %*s^%s\n", length($p), " ", "~" x (length($w)-1); printf STDERR " maybe use \"%s\" instead?\n", $alt{$lookup}; $errors++; } sub file { my ($f) = @_; my $l = 0; open(F, "<$f"); while() { my $in = $_; $l++; chomp $in; if($skip_indented && $in =~ /^ /) { next; } # remove the link part $in =~ s/(\[.*\])\(.*\)/$1/g; # remove whitelisted patterns (pre-compiled) if($re_wl) { $in =~ s/${re_wl}//ig; } # case-insensitive bad words if($re_ci) { if($in =~ /^(.*)$re_ci/i) { highlight($1, $2, $in, $f, $l, lc($2)); } } # case-sensitive (exact) bad words if($re_cs) { if($in =~ /^(.*)$re_cs/) { highlight($1, $2, $in, $f, $l, $2); } } } close(F); } my @filemasks = @ARGV; open(my $git_ls_files, '-|', 'git', 'ls-files', '--', @filemasks) or die "Failed running git ls-files: $!"; my @files; while(my $each = <$git_ls_files>) { chomp $each; push @files, $each; } close $git_ls_files; my $onum = scalar(@files); my $num; for my $e (@files) { #printf STDERR "Complete: %d%%\r", $num++ * 100 / $onum; file($e); } exit $errors;