spacecheck.pl: check for non-ASCII chars, fix fallouts

Reported-by: James Fuller
Assisted-by: Dan Fandrich

Closes #17247
This commit is contained in:
Viktor Szakats 2025-05-03 18:11:29 +02:00
parent e1f87a093b
commit 838dc53bb7
No known key found for this signature in database
GPG key ID: B5ABD165E2AEF201
4 changed files with 38 additions and 6 deletions

View file

@ -47,6 +47,31 @@ my @space_at_eol = (
"^tests/data/test",
);
# Non-ASCII characters that are tolerated in every file. Each entry is a
# single-quoted string, so it holds the literal escape text (backslash, 'x'
# and two hex digits per byte) describing the character's UTF-8 encoding,
# not the encoded bytes themselves. The escapes only become bytes once the
# text is interpolated into a regex.
my @non_ascii_allowed = (
'\xC3\xA1', # UTF-8 for https://codepoints.net/U+00E1 LATIN SMALL LETTER A WITH ACUTE
'\xC3\xA5', # UTF-8 for https://codepoints.net/U+00E5 LATIN SMALL LETTER A WITH RING ABOVE
'\xC3\xA4', # UTF-8 for https://codepoints.net/U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
'\xC3\xB6', # UTF-8 for https://codepoints.net/U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
'\xC2\xB1', # UTF-8 for https://codepoints.net/U+00B1 PLUS-MINUS SIGN
'\xC2\xA7', # UTF-8 for https://codepoints.net/U+00A7 SECTION SIGN
'\xC3\x9F', # UTF-8 for https://codepoints.net/U+00DF LATIN SMALL LETTER SHARP S
'\xF0\x9F\x99\x8F', # UTF-8 for https://codepoints.net/U+1F64F PERSON WITH FOLDED HANDS
);
# All entries above as a single string, separated by ', '.
my $non_ascii_allowed = join(', ', @non_ascii_allowed);
# Filename masks handed to fn_match(); a file matching any of them is not
# checked for non-ASCII content.
my @non_ascii = qw(
    .github/scripts/spellcheck.words
    .mailmap
    RELEASE-NOTES
    docs/BINDINGS.md
    docs/CIPHERS.md
    docs/THANKS
    docs/THANKS-filter
    tests/libtest/lib1560.c
    ^tests/data/test
);
sub fn_match {
my ($filename, @masklist) = @_;
@ -134,6 +159,13 @@ while(my $filename = <$git_ls_files>) {
push @err, "content: has binary contents";
}
# Flag non-ASCII bytes unless the file is exempt via @non_ascii.
#
# Each @non_ascii_allowed entry is the regex escape text for one complete
# UTF-8 sequence, so the entries are combined as alternatives and only whole
# allowed sequences are discarded. Interpolating them into a bracketed
# character class would instead discard every listed byte on its own (and
# the list separator characters too), letting other characters built from
# those bytes slip through unnoticed.
#
# The stripping is done on a copy so that $content itself is left intact.
if(!fn_match($filename, @non_ascii)) {
    my $allowed_seq = join('|', @non_ascii_allowed);
    (my $remaining = $content) =~ s/(?:$allowed_seq)//g;
    if($remaining =~ /([\x80-\xff]+)/) {
        push @err, "content: has non-ASCII: '$1'";
    }
}
if(@err) {
$issues++;
foreach my $err (@err) {