From 1735e5c2cf87b28b096ad91008bdb764d853b26d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:21 +0000 Subject: linkify: support Internationalized Domain Names in URLs The "\w" character class in Perl matches any word characters in the Unicode database, not just ASCII characters. So we must be prepared for that and generate links to IDNs. --- lib/PublicInbox/Linkify.pm | 5 +++-- t/linkify.t | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index d4778e7d..84960a98 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -13,6 +13,7 @@ package PublicInbox::Linkify; use strict; use warnings; use Digest::SHA qw/sha1_hex/; +use PublicInbox::Hval qw(ascii_html); my $SALT = rand; my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|gopher):// @@ -61,12 +62,12 @@ sub linkify_1 { $end = ')'; } + $url = ascii_html($url); # for IDN + # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); - # only escape ampersands, others do not match LINK_RE - $url =~ s/&/&/g; $_[0]->{$key} = $url; $beg . 'PI-LINK-'. $key . $end; ^ge; diff --git a/t/linkify.t b/t/linkify.t index fe218b91..c4923582 100644 --- a/t/linkify.t +++ b/t/linkify.t @@ -132,4 +132,16 @@ use PublicInbox::Linkify; 'punctuation with unpaired ) OK') } +if ('IDN example: ') { + my $hc = '月'; + my $u = "http://www.\x{6708}.example.com/"; + my $s = $u; + my $l = PublicInbox::Linkify->new; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + my $expect = qq{http://www.$hc.example.com/}; + is($s, $expect, 'IDN message escaped properly'); +} + done_testing(); -- cgit v1.2.3-24-ge0c7 From 02598bd82780d9b16fd091268f2dccf989489b0e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:25 +0000 Subject: nntp: be explicit about ASCII digit matches We aren't able to make sense of non-ASCII digits cf. 
perlrecharclass(1) / "Digits" section --- lib/PublicInbox/NNTP.pm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 8cb6c56d..57300e89 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -437,7 +437,7 @@ sub set_nntp_headers ($$$$$) { # clobber some my $xref = xref($self, $ng, $n, $mid); $hdr->header_set('Xref', $xref); - $xref =~ s/:\d+//g; + $xref =~ s/:[0-9]+//g; $hdr->header_set('Newsgroups', (split(/ /, $xref, 2))[1]); header_append($hdr, 'List-Post', "{-primary_address}>"); if (my $url = $ng->base_url) { @@ -453,7 +453,7 @@ sub art_lookup ($$$) { my ($n, $mid); my $err; if (defined $art) { - if ($art =~ /\A\d+\z/o) { + if ($art =~ /\A[0-9]+\z/) { $err = '423 no such article number in this group'; $n = int($art); goto find_mid; @@ -508,7 +508,7 @@ sub simple_body_write ($$) { sub set_art { my ($self, $art) = @_; - $self->{article} = $art if defined $art && $art =~ /\A\d+\z/; + $self->{article} = $art if defined $art && $art =~ /\A[0-9]+\z/; } sub _header ($) { @@ -576,11 +576,11 @@ sub get_range ($$) { defined $range or return '420 No article(s) selected'; my ($beg, $end); my ($min, $max) = $ng->mm->minmax; - if ($range =~ /\A(\d+)\z/) { + if ($range =~ /\A([0-9]+)\z/) { $beg = $end = $1; - } elsif ($range =~ /\A(\d+)-\z/) { + } elsif ($range =~ /\A([0-9]+)-\z/) { ($beg, $end) = ($1, $max); - } elsif ($range =~ /\A(\d+)-(\d+)\z/) { + } elsif ($range =~ /\A([0-9]+)-([0-9]+)\z/) { ($beg, $end) = ($1, $2); } else { return r501; -- cgit v1.2.3-24-ge0c7 From 53a7adb8aa292f032c44886f220c3e8ed5f93378 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:20:40 +0000 Subject: nntp: ensure we only handle ASCII whitespace RFC3977 does not have provisions for whitespace beyond ASCII TAB, SP, CR and LF. I doubt there's any NNTP clients broken enough to be sending non-ASCII whitespace delimiters. 
We're probably excessively liberal regarding TAB acceptance, even; but it's probably too late to change at this point... --- lib/PublicInbox/NNTP.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm index 57300e89..be80560f 100644 --- a/lib/PublicInbox/NNTP.pm +++ b/lib/PublicInbox/NNTP.pm @@ -121,7 +121,7 @@ sub args_ok ($$) { # returns 1 if we can continue, 0 if not due to buffered writes or disconnect sub process_line ($$) { my ($self, $l) = @_; - my ($req, @args) = split(/\s+/, $l); + my ($req, @args) = split(/[ \t]/, $l); return 1 unless defined($req); # skip blank line $req = lc($req); $req = eval { @@ -959,7 +959,7 @@ sub event_read { $self->{rbuf} .= $$buf; } my $r = 1; - while ($r > 0 && $self->{rbuf} =~ s/\A\s*([^\r\n]*)\r?\n//) { + while ($r > 0 && $self->{rbuf} =~ s/\A[ \t\r\n]*([^\r\n]*)\r?\n//) { my $line = $1; return $self->close if $line =~ /[[:cntrl:]]/s; my $t0 = now(); -- cgit v1.2.3-24-ge0c7 From 1f8065599d934b2af7d24773bb7d6901f9586945 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:23 +0000 Subject: mid: id_compress requires ASCII-clean words Its result is used for HTML anchors and such. 
--- lib/PublicInbox/MID.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm index 7f1ab15e..6904d61a 100644 --- a/lib/PublicInbox/MID.pm +++ b/lib/PublicInbox/MID.pm @@ -26,11 +26,11 @@ sub mid_clean { $mid; } -# this is idempotent +# this is idempotent, used for HTML anchor/ids and such sub id_compress { my ($id, $force) = @_; - if ($force || $id =~ /[^\w\-]/ || length($id) > MID_MAX) { + if ($force || $id =~ /[^a-zA-Z0-9_\-]/ || length($id) > MID_MAX) { utf8::encode($id); return sha1_hex($id); } -- cgit v1.2.3-24-ge0c7 From 010fbd95d3916e20960d0aacea7dfc53502ff5ed Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:22 +0000 Subject: feed: only accept ASCII digits for ref~$N We don't want to waste cycles passing non-ASCII characters to git. --- lib/PublicInbox/Feed.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index a04838a1..ae071895 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -102,7 +102,7 @@ sub recent_msgs { my $hex = '[a-f0-9]'; my $addmsg = qr!^:000000 100644 \S+ (\S+) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 (\S+) \S+ D\t(${hex}{2}/${hex}{38})$!; - my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~\d+)?/; + my $refhex = qr/(?:HEAD|${hex}{4,40})(?:~[0-9]+)?/; # revision ranges may be specified my $range = 'HEAD'; -- cgit v1.2.3-24-ge0c7 From fc17b626cf3b4425899ea5073621fbeb7f8be18c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:26 +0000 Subject: http: require SERVER_PORT to be ASCII digit I'm not sure what middlewares care for SERVER_PORT; but allowing non-ASCII digits seems nonsensical, here. 
--- lib/PublicInbox/HTTP.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 10e6d6a4..977614b4 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -142,7 +142,7 @@ sub app_dispatch { $env->{REMOTE_ADDR} = $self->{remote_addr}; $env->{REMOTE_PORT} = $self->{remote_port}; if (my $host = $env->{HTTP_HOST}) { - $host =~ s/:(\d+)\z// and $env->{SERVER_PORT} = $1; + $host =~ s/:([0-9]+)\z// and $env->{SERVER_PORT} = $1; $env->{SERVER_NAME} = $host; } if (defined $input) { -- cgit v1.2.3-24-ge0c7 From fc483fde5dc78d7e6b230527e375c5f421997565 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:32:27 +0000 Subject: wwwlisting: require ASCII digit for port number We only care about the hostname portion for matching, so this change is probably inconsequential. --- lib/PublicInbox/WwwListing.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index e8dad4b8..e1473b3d 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -24,8 +24,8 @@ sub list_match_domain ($$) { my ($self, $env) = @_; my @list; my $host = $env->{HTTP_HOST} // $env->{SERVER_NAME}; - $host =~ s/:\d+\z//; - my $re = qr!\A(?:https?:)?//\Q$host\E(?::\d+)?/!i; + $host =~ s/:[0-9]+\z//; + my $re = qr!\A(?:https?:)?//\Q$host\E(?::[0-9]+)?/!i; $self->{pi_config}->each_inbox(sub { my ($ibx) = @_; push @list, $ibx if !$ibx->{-hide}->{www} && $ibx->{url} =~ $re; -- cgit v1.2.3-24-ge0c7 From f8c8ca04d47620d390092000ca09aab071442fac Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:27 +0000 Subject: wwwattach: only pass the charset through if ASCII AFAIK all names of charsets are ASCII, so passing non-ASCII characters from emails to clients would probably confuse clients. 
--- lib/PublicInbox/WwwAttach.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm index d690ce41..96103cb0 100644 --- a/lib/PublicInbox/WwwAttach.pm +++ b/lib/PublicInbox/WwwAttach.pm @@ -27,7 +27,7 @@ sub get_attach ($$$) { if ($ct && (($ct->{discrete} || '') eq 'text')) { # display all text as text/plain: my $cset = $ct->{attributes}->{charset}; - if ($cset && ($cset =~ /\A[\w-]+\z/)) { + if ($cset && ($cset =~ /\A[a-zA-Z0-9_\-]+\z/)) { $res->[1]->[1] .= qq(; charset=$cset); } } else { # TODO: allow user to configure safe types -- cgit v1.2.3-24-ge0c7 From c5621af43e9c7cb1ff0565aa61a1d8fced55a23b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:28 +0000 Subject: www: only emit ASCII chars in attachment filenames We don't want to emit funky URLs which can be lost in translation or cause problems with non-Unicode-aware clients. Then, don't accept non-ASCII filenames in URLs, since a manually-generated URL/filename in attachment downloads could be used for Unicode homographs to confuse folks who download the attachment. 
--- lib/PublicInbox/Hval.pm | 3 +++ lib/PublicInbox/View.pm | 2 +- lib/PublicInbox/WWW.pm | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 95a0f709..2b443970 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -13,6 +13,9 @@ our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape to_attr from_attr/; my $enc_ascii = find_encoding('us-ascii'); +# safe-ish acceptable filename pattern for portability +our $FN = '[a-zA-Z0-9][a-zA-Z0-9_\-\.]+[a-zA-Z0-9]'; # needs \z anchor + sub new { my ($class, $raw, $href) = @_; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 09afdaf1..83ae99bc 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -528,7 +528,7 @@ sub attach_link ($$$$;$) { $desc = $fn unless defined $desc; $desc = '' unless defined $desc; my $sfn; - if (defined $fn && $fn =~ /\A[[:alnum:]][\w\.-]+[[:alnum:]]\z/) { + if (defined $fn && $fn =~ /\A$PublicInbox::Hval::FN\z/o) { $sfn = $fn; } elsif ($ct eq 'text/plain') { $sfn = 'a.txt'; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index b6f18f8d..50b6950c 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -28,7 +28,7 @@ use PublicInbox::UserContent; our $INBOX_RE = qr!\A/([\w\-][\w\.\-]*)!; our $MID_RE = qr!([^/]+)!; our $END_RE = qr!(T/|t/|t\.mbox(?:\.gz)?|t\.atom|raw|)!; -our $ATTACH_RE = qr!(\d[\.\d]*)-([[:alnum:]][\w\.-]+[[:alnum:]])!i; +our $ATTACH_RE = qr!([0-9][0-9\.]*)-($PublicInbox::Hval::FN)!; our $OID_RE = qr![a-f0-9]{7,40}!; sub new { -- cgit v1.2.3-24-ge0c7 From bb64c28a4a2688171b7625e99ed72dd51a5ee074 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 09:02:01 +0000 Subject: www: require ASCII filenames in git blob downloads Our Hval::to_filename sub has always been strict about emitting ASCII-only characters for ViewVCS "raw" links. 
However, somebody could manually generate a filename with non-ASCII words for somebody else to download (we have no cheap and fast way of mapping filenames back to blobs for validation). --- lib/PublicInbox/WWW.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 50b6950c..7670224f 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -127,7 +127,8 @@ sub call { get_css($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/\z!o) { get_vcs_object($ctx, $1, $2); - } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/([\w\.\-]+)\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/ + ($PublicInbox::Hval::FN)\z!ox) { get_vcs_object($ctx, $1, $2, $3); } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s\z!o) { r301($ctx, $1, $2, 's/'); -- cgit v1.2.3-24-ge0c7 From 63636d78c9c1aba31c4141460b0012ffee96ff53 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:29 +0000 Subject: config: do not accept non-ASCII digits in cgitrc params cgit uses atoi(3), and now we can retain compatibility. --- lib/PublicInbox/Config.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 09f9179b..6e85750a 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -307,7 +307,7 @@ sub parse_cgitrc { } } elsif (m!\Ainclude=(.+)\z!) { parse_cgitrc($self, $1, $nesting + 1); - } elsif (m!\A(scan-hidden-path|remove-suffix)=(\d+)\z!) { + } elsif (m!\A(scan-hidden-path|remove-suffix)=([0-9]+)\z!) { my ($k, $v) = ($1, $2); $k =~ tr/-/_/; $self->{"-cgit_$k"} = $v; -- cgit v1.2.3-24-ge0c7 From af3d2366e9c2096bc927f7dce66cd024bc866b51 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:30 +0000 Subject: newswww: only accept ASCII digits as article numbers Non-ASCII digits aren't specified in RFC3977 for article numbers; so don't waste a trip to SQLite only to turn up empty. 
--- lib/PublicInbox/NewsWWW.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index 8626cf96..80bb4886 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -47,7 +47,7 @@ sub call { if (my $ibx = $pi_config->lookup_newsgroup($ng)) { my $url = PublicInbox::Hval::prurl($env, $ibx->{url}); my $code = 301; - if (defined $article && $article =~ /\A\d+\z/) { + if (defined $article && $article =~ /\A[0-9]+\z/) { my $mid = eval { $ibx->mm->mid_for($article) }; if (defined $mid) { # article IDs are not stable across clones, -- cgit v1.2.3-24-ge0c7 From a533d298a88688587311efc8d59c924a502667f9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:31 +0000 Subject: view: require YYYYmmDD(HHMMSS) timestamps to be ASCII Passing digits to `timegm' which it does not understand would be a waste of time. --- lib/PublicInbox/View.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 83ae99bc..1b52bf86 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -1160,8 +1160,8 @@ sub paginate_recent ($$) { # Xapian uses '..' but '-' is perhaps friendier to URL linkifiers # if only $after exists "YYYYMMDD.." because "." could be skipped # if interpreted as an end-of-sentence - $t =~ s/\A(\d{8,14})-// and $after = str2ts($1); - $t =~ /\A(\d{8,14})\z/ and $before = str2ts($1); + $t =~ s/\A([0-9]{8,14})-// and $after = str2ts($1); + $t =~ /\A([0-9]{8,14})\z/ and $before = str2ts($1); my $ibx = $ctx->{-inbox}; my $msgs = $ibx->recent($opts, $after, $before); -- cgit v1.2.3-24-ge0c7 From 3abd653020d6b4072759c83fb69c68b6e838aa09 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:32 +0000 Subject: githttpbackend: require Range:, Status: to be ASCII digits Non-ASCII digits would be interpreted as zeroes when used as integers. 
While we're at it, ensure the Status: code is an ASCII digit, too; though I would not expect git-http-backend(1) or cgit(1) start spewing non-ASCII digits at us. --- lib/PublicInbox/GitHTTPBackend.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index 09411048..e871bdde 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -90,7 +90,7 @@ sub static_result ($$$$) { my $len = $size; my $code = 200; push @$h, 'Content-Type', $type; - if (($env->{HTTP_RANGE} || '') =~ /\bbytes=(\d*)-(\d*)\z/) { + if (($env->{HTTP_RANGE} || '') =~ /\bbytes=([0-9]*)-([0-9]*)\z/) { ($code, $len) = prepare_range($env, $in, $h, $1, $2, $size); if ($code == 416) { push @$h, 'Content-Range', "bytes */$size"; @@ -260,7 +260,7 @@ sub parse_cgi_headers { foreach my $l (split(/\r?\n/, $h)) { my ($k, $v) = split(/:\s*/, $l, 2); if ($k =~ /\AStatus\z/i) { - ($code) = ($v =~ /\b(\d+)\b/); + ($code) = ($v =~ /\b([0-9]+)\b/); } else { push @h, $k, $v; } -- cgit v1.2.3-24-ge0c7 From 7c29cce3cb92aeadc1ec589c96b36936e38fe8c1 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:33 +0000 Subject: searchview: do not allow non-ASCII offsets and limits Non-ASCII digits would be interpreted as zero when used as integers. 
--- lib/PublicInbox/SearchView.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 6592b3b2..b089de9c 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -308,12 +308,12 @@ sub new { my ($class, $qp) = @_; my $r = $qp->{r}; - my ($l) = (($qp->{l} || '') =~ /(\d+)/); + my ($l) = (($qp->{l} || '') =~ /([0-9]+)/); $l = $LIM if !$l || $l > $LIM; bless { q => $qp->{'q'}, x => $qp->{x} || '', - o => (($qp->{o} || '0') =~ /(\d+)/), + o => (($qp->{o} || '0') =~ /([0-9]+)/), l => $l, r => (defined $r && $r ne '0'), }, $class; -- cgit v1.2.3-24-ge0c7 From 3703c8265f9294b8cff7172b62c8b923bcd8791a Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:24 +0000 Subject: msgtime: require ASCII digits for parsing dates User input contains the darndest things. Don't waste more time than necessary trying to parse dates out of non-ASCII digits. --- lib/PublicInbox/MsgTime.pm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/MsgTime.pm b/lib/PublicInbox/MsgTime.pm index 62160233..12412825 100644 --- a/lib/PublicInbox/MsgTime.pm +++ b/lib/PublicInbox/MsgTime.pm @@ -44,8 +44,9 @@ sub msg_received_at ($) { my @recvd = $hdr->header_raw('Received'); my ($ts); foreach my $r (@recvd) { - $r =~ /\s*(\d+\s+[[:alpha:]]+\s+\d{2,4}\s+ - \d+\D\d+(?:\D\d+)\s+([\+\-]\d+))/sx or next; + $r =~ /\s*([0-9]+\s+[a-zA-Z]+\s+[0-9]{2,4}\s+ + [0-9]+[^0-9][0-9]+(?:[^0-9][0-9]+) + \s+([\+\-][0-9]+))/sx or next; $ts = eval { str2date_zone($1) } and return $ts; my $mid = $hdr->header_raw('Message-ID'); warn "no date in $mid Received: $r\n"; @@ -59,7 +60,7 @@ sub msg_date_only ($) { my ($ts); foreach my $d (@date) { # Y2K problems: 3-digit years - $d =~ s!([A-Za-z]{3}) (\d{3}) (\d\d:\d\d:\d\d)! + $d =~ s!([A-Za-z]{3}) ([0-9]{3}) ([0-9]{2}:[0-9]{2}:[0-9]{2})! 
my $yyyy = $2 + 1900; "$1 $yyyy $3"!e; $ts = eval { str2date_zone($d) } and return $ts; if ($@) { -- cgit v1.2.3-24-ge0c7 From ac4fbb3237e3fab8c4056cb595f6a2b677d8c16e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:25:10 +0000 Subject: filter/rubylang: require ASCII digit for mailcount Unlikely to matter, but who knows... --- lib/PublicInbox/Filter/RubyLang.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Filter/RubyLang.pm b/lib/PublicInbox/Filter/RubyLang.pm index a43d67a9..d40705b7 100644 --- a/lib/PublicInbox/Filter/RubyLang.pm +++ b/lib/PublicInbox/Filter/RubyLang.pm @@ -50,7 +50,7 @@ sub scrub { my @v = $hdr->header_raw('X-Mail-Count'); my $n; foreach (@v) { - /\A\s*(\d+)\s*\z/ or next; + /\A\s*([0-9]+)\s*\z/ or next; $n = $1; last; } -- cgit v1.2.3-24-ge0c7 From b5693a2107a1edd02cfad73f9302833e1b91d339 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:29:36 +0000 Subject: inbox: require ASCII digits for feedmax var Don't waste more cycles than necessary if somebody decides to put non-ASCII digits in their ~/.public-inbox/config --- lib/PublicInbox/Inbox.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index b3178b98..04d2f832 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -74,7 +74,7 @@ sub _set_uint ($$$) { my $val = $opts->{$field}; if (defined $val) { $val = $val->[-1] if ref($val) eq 'ARRAY'; - $val = undef if $val !~ /\A\d+\z/; + $val = undef if $val !~ /\A[0-9]+\z/; } $opts->{$field} = $val || $default; } -- cgit v1.2.3-24-ge0c7 From bbfa42a9ea55b7057c7a6b632f090763c9e7c655 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:30:55 +0000 Subject: solver|viewdiff: restrict digit matches to ASCII git would not generate non-ASCII digits to describe hunk offsets, so don't waste more time than necessary to make sense of non-ASCII digit chars for line offsets. 
--- lib/PublicInbox/SolverGit.pm | 2 +- lib/PublicInbox/ViewDiff.pm | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 3841c567..81f99025 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -206,7 +206,7 @@ sub find_extract_diff ($$$) { } my $msgs = $srch->query($q, { relevance => 1 }); - my $re = qr/\Aindex ($pre[a-f0-9]*)\.\.($post[a-f0-9]*)(?: (\d+))?/; + my $re = qr/\Aindex ($pre[a-f0-9]*)\.\.($post[a-f0-9]*)(?: ([0-9]+))?/; my $di; foreach my $smsg (@$msgs) { diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index 411ed2bb..b7dab819 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -55,12 +55,12 @@ sub diff_hunk ($$$$) { (defined($spfx) && defined($oid_a) && defined($oid_b)) or return "@@ $ca $cb @@"; - my ($n) = ($ca =~ /^-(\d+)/); + my ($n) = ($ca =~ /^-([0-9]+)/); $n = defined($n) ? do { ++$n; "#n$n" } : ''; my $rv = qq(@@ {Q}$n">$ca); - ($n) = ($cb =~ /^\+(\d+)/); + ($n) = ($cb =~ /^\+([0-9]+)/); $n = defined($n) ? do { ++$n; "#n$n" } : ''; $rv .= qq( {Q}$n">$cb @@); -- cgit v1.2.3-24-ge0c7 From b04bdc8cd749dd3dfcc9351b2b47bfdf190b4a3a Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:36:18 +0000 Subject: www: require ASCII digit for git epoch Don't inadvertently serve git repos containing non-ASCII digit characters. --- lib/PublicInbox/WWW.pm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 7670224f..b0fad7fe 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -74,7 +74,8 @@ sub call { my $method = $env->{REQUEST_METHOD}; if ($method eq 'POST') { - if ($path_info =~ m!$INBOX_RE/(?:(\d+)/)?(git-upload-pack)\z!) { + if ($path_info =~ m!$INBOX_RE/(?:([0-9]+)/)? 
+ (git-upload-pack)\z!x) { my ($part, $path) = ($2, $3); return invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); @@ -97,7 +98,7 @@ sub call { invalid_inbox($ctx, $1) || get_atom($ctx); } elsif ($path_info =~ m!$INBOX_RE/new\.html\z!o) { invalid_inbox($ctx, $1) || get_new($ctx); - } elsif ($path_info =~ m!$INBOX_RE/(?:(\d+)/)? + } elsif ($path_info =~ m!$INBOX_RE/(?:([0-9]+)/)? ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my ($part, $path) = ($2, $3); invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); -- cgit v1.2.3-24-ge0c7 From d3906fed88f403552d1629e9ecc9974ab85abaae Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:40:34 +0000 Subject: require ASCII digits for local FS items In case some BOFH decides to randomly create directories using non-ASCII digits all over the place. --- lib/PublicInbox/Inbox.pm | 4 ++-- lib/PublicInbox/Search.pm | 2 +- lib/PublicInbox/V2Writable.pm | 4 ++-- lib/PublicInbox/Xapcmd.pm | 6 +++--- script/public-inbox-purge | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 04d2f832..c9330332 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -87,7 +87,7 @@ sub _set_limiter ($$$) { my $mkey = $pfx.'max'; my $val = $self->{$mkey} or return; my $lim; - if ($val =~ /\A\d+\z/) { + if ($val =~ /\A[0-9]+\z/) { require PublicInbox::Qspawn; $lim = PublicInbox::Qspawn::Limiter->new($val); } elsif ($val =~ /\A[a-z][a-z0-9]*\z/) { @@ -161,7 +161,7 @@ sub max_git_part { if (opendir my $dh, $gits) { my $max = -1; while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A(\d+)\.git\z! or next; + $git_dir =~ m!\A([0-9]+)\.git\z! 
or next; $max = $1 if $1 > $max; } $part = $self->{-max_git_part} = $max if $max >= 0; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index c054a874..9903f427 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -144,7 +144,7 @@ sub _xdb ($) { my $qpf = \($self->{qp_flags} ||= $QP_FLAGS); if ($self->{version} >= 2) { foreach my $part (<$dir/*>) { - -d $part && $part =~ m!/\d+\z! or next; + -d $part && $part =~ m!/[0-9]+\z! or next; my $sub = Search::Xapian::Database->new($part); if ($xdb) { $xdb->add_database($sub); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 76844cd4..a8c33ef4 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -48,7 +48,7 @@ sub count_partitions ($) { # due to -compact if (-d $xpfx) { foreach my $part (<$xpfx/*>) { - -d $part && $part =~ m!/\d+\z! or next; + -d $part && $part =~ m!/[0-9]+\z! or next; eval { Search::Xapian::Database->new($part)->close; $nparts++; @@ -574,7 +574,7 @@ sub git_dir_latest { my $latest; opendir my $dh, $pfx or die "opendir $pfx: $!\n"; while (defined(my $git_dir = readdir($dh))) { - $git_dir =~ m!\A(\d+)\.git\z! or next; + $git_dir =~ m!\A([0-9]+)\.git\z! or next; if ($1 > $$max) { $$max = $1; $latest = "$pfx/$git_dir"; diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 90672310..dad080c8 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -150,7 +150,7 @@ sub run { } else { opendir my $dh, $old or die "Failed to opendir $old: $!\n"; while (defined(my $dn = readdir($dh))) { - if ($dn =~ /\A\d+\z/) { + if ($dn =~ /\A[0-9]+\z/) { my $tmpl = "$dn-XXXXXXXX"; my $dst = tempdir($tmpl, DIR => $old); same_fs_or_die($old, $dst); @@ -200,7 +200,7 @@ sub progress_pfx ($) { my @p = split('/', $_[0]); # return "xap15/0" for v2, or "xapian15" for v1: - ($p[-1] =~ /\A\d+\z/) ? "$p[-2]/$p[-1]" : $p[-1]; + ($p[-1] =~ /\A[0-9]+\z/) ? 
"$p[-2]/$p[-1]" : $p[-1]; } # xapian-compact wrapper @@ -276,7 +276,7 @@ sub cpdb ($$) { $dst->set_metadata('last_commit', $lc) if $lc; # only the first xapian partition (0) gets 'indexlevel' - if ($old =~ m!(?:xapian\d+|xap\d+/0)\z!) { + if ($old =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\z!) { my $l = $src->get_metadata('indexlevel'); if ($l eq 'medium') { $dst->set_metadata('indexlevel', $l); diff --git a/script/public-inbox-purge b/script/public-inbox-purge index 381826dc..25e6cc9b 100755 --- a/script/public-inbox-purge +++ b/script/public-inbox-purge @@ -91,7 +91,7 @@ foreach my $ibx (@inboxes) { my $xdir_ro = $ibx->{search}->xdir(1); my $npart = 0; foreach my $part (<$xdir_ro/*>) { - if (-d $part && $part =~ m!/\d+\z!) { + if (-d $part && $part =~ m!/[0-9]+\z!) { my $bytes = 0; $bytes += -s $_ foreach glob("$part/*"); $npart++ if $bytes; -- cgit v1.2.3-24-ge0c7 From 7a8c54a0a90fce4c965b05769e10182388a84c31 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 08:58:32 +0000 Subject: githttpbackend: require ASCII in path We mainly support git-upload-pack; and maybe somebody uses git-receive-pack with this. Perhaps other (experimental) command names are acceptable. But it's unlikely anybody will want Unicode command names for git services. 
--- lib/PublicInbox/GitHTTPBackend.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index e871bdde..a2a81f8e 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -51,8 +51,8 @@ sub serve { # Documentation/technical/http-protocol.txt in git.git # requires one and exactly one query parameter: - if ($env->{QUERY_STRING} =~ /\Aservice=git-\w+-pack\z/ || - $path =~ /\Agit-\w+-pack\z/) { + if ($env->{QUERY_STRING} =~ /\Aservice=git-[A-Za-z0-9_]+-pack\z/ || + $path =~ /\Agit-[A-Za-z0-9_]+-pack\z/) { my $ok = serve_smart($env, $git, $path); return $ok if $ok; } -- cgit v1.2.3-24-ge0c7 From aedd4d6d205a4e9ae6d1d81fd011fb2f896be41b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 09:05:51 +0000 Subject: www: require ASCII range for mbox downloads We do not support many mboxrd download range specifications at the moment; but parsing non-ASCII characters isn't planned. This makes no difference aside from being able to return 404 slightly earlier than we would've in the past. 
--- lib/PublicInbox/WWW.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index b0fad7fe..f41f98ed 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -102,7 +102,7 @@ sub call { ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { my ($part, $path) = ($2, $3); invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); - } elsif ($path_info =~ m!$INBOX_RE/([\w-]+).mbox\.gz\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/([a-zA-Z0-9_\-]+).mbox\.gz\z!o) { serve_mbox_range($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$END_RE\z!o) { msg_page($ctx, $1, $2, $3); -- cgit v1.2.3-24-ge0c7 From 91af69a41f2963f1f952cb0932ed23cd86cd1093 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 10:19:34 +0000 Subject: www: require ASCII word characters for CSS filenames Allowing admins to set non-ASCII CSS filenames could cause unnecessary problems for client and proxies. --- lib/PublicInbox/WWW.pm | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index f41f98ed..7ea98204 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -124,7 +124,7 @@ sub call { r301($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/_/text(?:/(.*))?\z!o) { get_text($ctx, $1, $2); - } elsif ($path_info =~ m!$INBOX_RE/([\w\-\.]+)\.css\z!o) { + } elsif ($path_info =~ m!$INBOX_RE/([a-zA-Z0-9_\-\.]+)\.css\z!o) { get_css($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/($OID_RE)/s/\z!o) { get_vcs_object($ctx, $1, $2); @@ -536,11 +536,15 @@ sub stylesheets_prepare ($$) { $inline_ok = 0; } else { my $fn = $_; + my ($key) = (m!([^/]+?)(?:\.css)?\z!i); + if ($key !~ /\A[a-zA-Z0-9_\-\.]+\z/) { + warn "ignoring $fn, non-ASCII word character\n"; + next; + } open(my $fh, '<', $fn) or do { warn "failed to open $fn: $!\n"; next; }; - my ($key) = (m!([^/]+?)(?:\.css)?\z!i); my $ctime = 0; my $local = do { local $/; <$fh> }; if ($local =~ /\S/) { -- 
cgit v1.2.3-24-ge0c7