From 1735e5c2cf87b28b096ad91008bdb764d853b26d Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 4 Jun 2019 02:04:21 +0000 Subject: linkify: support Internationalized Domain Names in URLs The "\w" character class in Perl matches any word characters in the Unicode database, not just ASCII characters. So we must be prepared for that and generate links to IDNs. --- lib/PublicInbox/Linkify.pm | 5 +++-- t/linkify.t | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index d4778e7d..84960a98 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -13,6 +13,7 @@ package PublicInbox::Linkify; use strict; use warnings; use Digest::SHA qw/sha1_hex/; +use PublicInbox::Hval qw(ascii_html); my $SALT = rand; my $LINK_RE = qr{([\('!])?\b((?:ftps?|https?|nntps?|gopher):// @@ -61,12 +62,12 @@ sub linkify_1 { $end = ')'; } + $url = ascii_html($url); # for IDN + # salt this, as this could be exploited to show # links in the HTML which don't show up in the raw mail. my $key = sha1_hex($url . $SALT); - # only escape ampersands, others do not match LINK_RE - $url =~ s/&/&amp;/g; $_[0]->{$key} = $url; $beg . 'PI-LINK-'. $key . $end; ^ge; diff --git a/t/linkify.t b/t/linkify.t index fe218b91..c4923582 100644 --- a/t/linkify.t +++ b/t/linkify.t @@ -132,4 +132,16 @@ use PublicInbox::Linkify; 'punctuation with unpaired ) OK') } +if ('IDN example: ') { + my $hc = '&#26376;'; + my $u = "http://www.\x{6708}.example.com/"; + my $s = $u; + my $l = PublicInbox::Linkify->new; + $s = $l->linkify_1($s); + $s = $l->linkify_2($s); + my $expect = qq{<a +href="http://www.$hc.example.com/">http://www.$hc.example.com/</a>}; + is($s, $expect, 'IDN message escaped properly'); +} + done_testing(); -- cgit v1.2.3-24-ge0c7