curl-curl/scripts/spacecheck.pl

#!/usr/bin/env perl
#***************************************************************************
#                                  _   _ ____  _
#  Project                     ___| | | |  _ \| |
#                             / __| | | | |_) | |
#                            | (__| |_| |  _ <| |___
#                             \___|\___/|_| \_\_____|
#
# Copyright (C) Viktor Szakats
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at https://curl.se/docs/copyright.html.
#
# You may opt to use, copy, modify, merge, publish, distribute and/or sell
# copies of the Software, and permit persons to whom the Software is
# furnished to do so, under the terms of the COPYING file.
#
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
# KIND, either express or implied.
#
# SPDX-License-Identifier: curl
#
###########################################################################

use strict;
use warnings;

# Configuration tables.
#
# Unless noted otherwise, every entry is a regular expression (kept as a
# plain string) that fn_match() applies to a filename as printed by
# 'git ls-files'.

# Files that are allowed to contain tab characters.
my @tabs = (
    '^m4/zz40-xc-ovr\.m4$',
    'Makefile\.(am|example)$',
    '\.sln$',
    '^tests/data/data1706-stdout\.txt',
    '^tests/data/test',
);

# Files that must use CRLF line endings. Every other non-empty file must
# use LF.
my @need_crlf = (
    '\.(bat|sln)$',
);

# Files exempt from the general "duplicate empty line" check.
my @double_empty_lines = (
    '^RELEASE-NOTES$',
    '^lib/.+\.(c|h)$',
    '^projects/OS400/',
    '^projects/vms/',
    '^tests/data/test',
    '\.(m4|py)$',
);

# Files exempt from the long line check.
my @longline = (
    '\.github/workflows/windows\.yml$',
    '^renovate\.json$',
    '^docs/DISTROS\.md$',
    '^projects/Windows/tmpl/.+\.vcxproj$',
    '^tests/certs/srp-verifier-',
    '^tests/data/test',
);

# Non-ASCII byte sequences tolerated in any file outside tests/data. These
# are not filename masks: each entry is regex escape text describing bytes.
my @non_ascii_allowed = (
    '\xC3\xB6',  # UTF-8 for https://codepoints.net/U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
);

# The entries above joined into a single string, separated by ', '.
my $non_ascii_allowed = join(', ', @non_ascii_allowed);

# Files exempt from the non-ASCII check altogether.
my @non_ascii = (
    '^\.github/scripts/pyspelling\.words$',
    '^\.mailmap$',
    '^RELEASE-NOTES$',
    '^docs/BINDINGS\.md$',
    '^docs/THANKS$',
    '^docs/THANKS-filter$',
);

# Tell whether a filename matches at least one mask.
#
#   $filename  - path to test
#   @masklist  - regular expressions given as strings
#
# Returns 1 if any mask matches, 0 otherwise (including an empty list).
sub fn_match {
    my ($filename, @masklist) = @_;

    # grep in scalar context yields the number of matching masks
    my $hits = grep { $filename =~ $_ } @masklist;

    return $hits ? 1 : 0;
}

# Classify the end-of-line convention used in $content.
#
# Returns:
#   'lf'    only LF line endings
#   'crlf'  only CR LF pairs
#   'cr'    only CR line endings
#   'bin'   neither CR nor LF present (this includes empty content)
#   ''      any mixture of the above
sub eol_detect {
    my ($content) = @_;

    my $cr = ($content =~ tr/\r//);
    my $lf = ($content =~ tr/\n//);

    if($cr == 0 && $lf == 0) {
        return 'bin';
    }
    if($lf == 0) {
        return 'cr';
    }
    if($cr == 0) {
        return 'lf';
    }

    # Equal CR and LF totals do not prove CRLF: "a\rb\n" has one of each
    # but no CR LF pair. Every CR and every LF must be part of a pair.
    my $crlf = () = $content =~ /\r\n/g;
    if($crlf == $cr && $crlf == $lf) {
        return 'crlf';
    }

    return '';
}

# Number of files with at least one reported problem.
my $issues = 0;

# The allowed non-ASCII sequences as one alternation, so that only complete
# sequences are removed before the non-ASCII check. A character class would
# match their individual bytes instead and let stray bytes slip through.
my $allowed_re = join('|', @non_ascii_allowed);
$allowed_re = qr/$allowed_re/;

# Check every file tracked by git. Each finding is printed as
# "filename: message"; the script exits with 1 if any file had findings.
open(my $git_ls_files, '-|', 'git', 'ls-files') or die "Failed running git ls-files: $!";
while(my $filename = <$git_ls_files>) {
    chomp $filename;

    # slurp the whole file
    open(my $fh, '<', $filename) or die "Cannot open '$filename': $!";
    my $content = do { local $/; <$fh> };
    close $fh;

    # split once, shared by all per-line checks on the unmodified content
    my @lines = split(/\n/, $content);

    my @err = ();

    if(!fn_match($filename, @tabs) &&
       $content =~ /\t/) {
        push @err, 'content: has tab';
    }

    my $eol = eol_detect($content);

    if($eol eq '') {
        push @err, 'content: has mixed EOL types';
    }

    if($eol ne 'crlf' &&
       fn_match($filename, @need_crlf)) {
        push @err, 'content: must use CRLF EOL for this file type';
    }

    if($eol ne 'lf' && $content ne '' &&
       !fn_match($filename, @need_crlf)) {
        push @err, 'content: must use LF EOL for this file type';
    }

    # cheap whole-content test first, then locate the offending lines
    if($content =~ /[ \t]\n/) {
        my $line = 0;
        for my $l (@lines) {
            $line++;
            if($l =~ /[ \t]$/) {
                push @err, "line $line: trailing whitespace";
            }
        }
    }

    if($content ne '' &&
       $content !~ /\n\z/) {
        push @err, 'content: has no EOL at EOF';
    }

    if($content =~ /\n\n\z/ ||
       $content =~ /\r\n\r\n\z/) {
        push @err, 'content: has multiple EOL at EOF';
    }

    # Consecutive empty lines. Files listed in @double_empty_lines are only
    # examined when an empty-line run sits between a line ending in '>' and
    # a line starting with '<' or '#'.
    if((!fn_match($filename, @double_empty_lines) &&
        ($content =~ /\n\n\n/ ||
         $content =~ /\r\n\r\n\r\n/)) ||
       $content =~ />\n\n\n+[<#]/) {
        my $line = 0;
        my $blank = 0;  # true when the previous line was empty
        for my $l (@lines) {
            $line++;
            if($l =~ /^$/) {
                if($blank) {
                    my $lineno = sprintf('duplicate empty line @ line %d', $line);
                    push @err, $lineno;
                }
                $blank = 1;
            }
            else {
                $blank = 0;
            }
        }
    }

    if(!fn_match($filename, @longline)) {
        my $line = 0;
        my $max = 192;
        for my $l (@lines) {
            $line++;
            if(length($l) > $max) {
                push @err, sprintf('line %d: long (%d > %d) line', $line, length($l), $max);
            }
        }
    }

    # long runs of consecutive spaces
    my $line = 0;
    my $max = 79;
    for my $l (@lines) {
        $line++;
        if($l =~ /( {$max,})/) {
            push @err, sprintf('line %d: repeat spaces (%d > %d)', $line, length($1), $max);
        }
    }

    # The three scans below consume $search match by match. $linepos holds
    # the number of newlines already consumed, so adding the newline count
    # of the text up to the current match gives an absolute line number.

    # String literal folded over two lines with a space on both sides of
    # the fold. The match holds one newline, so the reported line is the
    # one ending in ' "'.
    my $search = $content;
    my $linepos = 0;
    while($search =~ /[^ ] "\n *" [^ ]/) {
        my $part = substr($search, 0, $+[0]);
        $search = substr($search, $+[0]);
        my $line = ($part =~ tr/\n//);
        push @err, sprintf('line %d: double spaces in folded string', $linepos + $line);
        $linepos += $line;
    }

    # Empty line right before a line holding only '}'. $part leaves out the
    # final newline of the match so that the count lands on the empty line;
    # that newline is added back into $linepos afterwards.
    $search = $content;
    $linepos = 0;
    while($search =~ /\n\n *}\n/) {
        my $part = substr($search, 0, $+[0] - 1);
        $search = substr($search, $+[0]);
        my $line = ($part =~ tr/\n//);
        push @err, sprintf("line %d: '}' preceded by empty line", $linepos + $line);
        $linepos += $line + 1;
    }

    # Empty line right after a '{' standing alone in column 0. The reported
    # line is the empty one.
    $search = $content;
    $linepos = 0;
    while($search =~ /\n\{\n\n/) {
        my $part = substr($search, 0, $+[0]);
        $search = substr($search, $+[0]);
        my $line = ($part =~ tr/\n//);
        push @err, sprintf("line %d: top-level '{' followed by empty line", $linepos + $line);
        $linepos += $line;
    }

    # control characters other than tab, LF and CR
    if($content =~ /([\x00-\x08\x0b\x0c\x0e-\x1f\x7f])/) {
        push @err, 'content: has binary contents';
    }

    if($filename !~ /tests\/data/) {
        # the tests have no allowed UTF bytes
        # Removing the allowed sequences never removes a newline, so line
        # numbers computed from $content stay valid.
        $content =~ s/$allowed_re//g;
    }

    if(!fn_match($filename, @non_ascii) &&
       $content =~ /[\x80-\xff]/) {
        my $line = 0;
        for my $l (split(/\n/, $content)) {
            $line++;
            # show the first offending run of this very line
            if($l =~ /([\x80-\xff]+)/) {
                my $non = $1;
                my $hex = join(' ', map { sprintf('%02x', ord($_)) } split(//, $non));
                push @err, "line $line: has non-ASCII: '$non' ($hex)";
            }
        }
    }

    if(@err) {
        $issues++;
        foreach my $err (@err) {
            print "$filename: $err\n";
        }
    }
}
# Closing a piped handle fails when the command exited non-zero. Without
# this check a failing git would make the script pass with nothing checked.
close($git_ls_files) or die "Failed running git ls-files: " .
    ($! ? "$!" : 'exit status ' . ($? >> 8));

if($issues) {
    exit 1;
}