Fix Net::Cmd::datasend() for octets stored in an upgraded string

The data passed to datasend() should already be encoded, but it can sometimes happen that the string holding the octets gets accidentally upgraded and it was wrong for datasend() to treat it differently in that case. Fixes CPAN RT#104433. Many thanks to Ricardo and Aristotle for their help on the ticket.
author: Steve Hay <steve.m.hay@googlemail.com> 2015-07-16 09:07:19 +0100
committer: Steve Hay <steve.m.hay@googlemail.com> 2015-07-16 09:07:19 +0100
commit: 20056b26e77c3a0874195d8286538e83ff950004 (patch)
tree: 03a69542fa058eeed14dc2cc2c10c854f772abe5
parent: 975f3f66d1a24a9d2d2e69da00cb385eb114c1cb (diff)
download: perl-libnet-20056b26e77c3a0874195d8286538e83ff950004.tar.gz
7 files changed, 74 insertions, 33 deletions
diff --git a/Changes b/Changes
index bf24244..4c45615 100644
--- a/Changes
+++ b/Changes
@@ -2,7 +2,35 @@ Revision history for Perl distribution libnet
  
  3.07 Development
  
-    - TODO
+    - Fixed a bug in Net::Cmd::datasend() which caused octets in [\x80-\xFF]
+      stored in a "binary string" to be replaced with their UTF-8 encodings if
+      the string happened to be stored internally in an "upgraded" state (i.e.
+      with the UTF-8 flag on). (As noted below, strings passed to datasend()
+      should always be encoded first, and therefore not stored in such a state
+      anyway, but it is all too easy for perl to change this internal state
+      unless the encoding is done at the very last minute before calling
+      datasend(), so it helps if datasend() plays more nicely in this case. In
+      particular, it was wrong of datasend() to treat upgraded and downgraded
+      strings differently when their contents were identical at the Perl level.)
+
+      This bugfix results in a breaking change to the case of a "text string"
+      with characters in U+0080..U+00FF stored internally in an upgraded state
+      since those characters are likewise no longer encoded to UTF-8 by
+      datasend(), but callers of datasend() should not have been relying on this
+      behaviour anyway: In general, datasend() has no idea what encoding is
+      required for output so callers should always encode the data to be output
+      to whatever encoding is required first. This has now been clarified in the
+      documentation.
+
+      Finally, a text string with characters >= U+0100 will now cause a "Wide
+      character in print" warning from datasend() since such characters cannot
+      be output as bytes and datasend() no longer encodes to UTF-8. In this
+      case, UTF-8 bytes will still be output as before since that happens to be
+      the internal representation of such characters, but the warning is new.
+      Callers should heed this warning and encode such strings to whatever
+      encoding is required before calling datasend(), as noted above.
+
+      [Ricardo Signes, CPAN RT#104433]
  
  3.06 2015-04-01
  
diff --git a/Makefile.PL b/Makefile.PL
index 7db02c5..872eac8 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -7,7 +7,7 @@
  #   Makefile creation script.
  #
  # COPYRIGHT
-#   Copyright (C) 2014 Steve Hay.  All rights reserved.
+#   Copyright (C) 2014, 2015 Steve Hay.  All rights reserved.
  #
  # LICENCE
  #   This script is free software; you can redistribute it and/or modify it under
@@ -206,6 +206,7 @@ MAIN: {
              'Time::Local'    => '0',
              'constant'       => '0',
              'strict'         => '0',
+            'utf8'           => '0',
              'vars'           => '0'
          },
  
diff --git a/lib/Net/Cmd.pm b/lib/Net/Cmd.pm
index cec44bf..3bf5ec6 100644
--- a/lib/Net/Cmd.pm
+++ b/lib/Net/Cmd.pm
@@ -2,7 +2,7 @@
  #
  # Versions up to 2.29_1 Copyright (c) 1995-2006 Graham Barr <gbarr@pobox.com>.
  # All rights reserved.
-# Changes in Version 2.29_2 onwards Copyright (C) 2013-2014 Steve Hay.  All
+# Changes in Version 2.29_2 onwards Copyright (C) 2013-2015 Steve Hay.  All
  # rights reserved.
  # This module is free software; you can redistribute it and/or modify it under
  # the same terms as Perl itself, i.e. under the terms of either the GNU General
@@ -27,21 +27,6 @@ BEGIN {
    }
  }
  
-BEGIN {
-  if (!eval { require utf8 }) {
-    *is_utf8 = sub { 0 };
-  }
-  elsif (eval { utf8::is_utf8(undef); 1 }) {
-    *is_utf8 = \&utf8::is_utf8;
-  }
-  elsif (eval { require Encode; Encode::is_utf8(undef); 1 }) {
-    *is_utf8 = \&Encode::is_utf8;
-  }
-  else {
-    *is_utf8 = sub { $_[0] =~ /[^\x00-\xff]/ };
-  }
-}
-
  our $VERSION = "3.07";
  our @ISA     = qw(Exporter);
  our @EXPORT  = qw(CMD_INFO CMD_OK CMD_MORE CMD_REJECT CMD_ERROR CMD_PENDING);
@@ -429,9 +414,17 @@ sub datasend {
    my $arr  = @_ == 1 && ref($_[0]) ? $_[0] : \@_;
    my $line = join("", @$arr);
  
-  # encode to individual utf8 bytes if
-  # $line is a string (in internal UTF-8)
-  utf8::encode($line) if is_utf8($line);
+  # Perls < 5.10.1 (with the exception of 5.8.9) have a performance problem with
+  # the substitutions below when dealing with strings stored internally in
+  # UTF-8, so downgrade them (if possible).
+  # Data passed to datasend() should be encoded to octets upstream already so
+  # shouldn't even have the UTF-8 flag on to start with, but if it so happens
+  # that the octets are stored in an upgraded string (as can sometimes occur)
+  # then they would still downgrade without fail anyway.
+  # Only Unicode codepoints > 0xFF stored in an upgraded string will fail to
+  # downgrade. We fail silently in that case, and a "Wide character in print"
+  # warning will be emitted later by syswrite().
+  utf8::downgrade($line, 1) if $] < 5.010001 && $] != 5.008009;
  
    return 0
      if $cmd->_is_closed;
@@ -722,6 +715,8 @@ is pending then C<CMD_PENDING> is returned.
  Send data to the remote server, converting LF to CRLF. Any line starting
  with a '.' will be prefixed with another '.'.
  C<DATA> may be an array or a reference to an array.
+The C<DATA> passed in must be encoded by the caller to octets of whatever
+encoding is required, e.g. by using the Encode module's C<encode()> function.
  
  =item dataend ()
  
@@ -794,6 +789,9 @@ Unget a line of text from the server.
  
  Send data to the remote server without performing any conversions. C<DATA>
  is a scalar.
+As with C<datasend()>, the C<DATA> passed in must be encoded by the caller
+to octets of whatever encoding is required, e.g. by using the Encode module's
+C<encode()> function.
  
  =item read_until_dot ()
  
diff --git a/lib/Net/NNTP.pm b/lib/Net/NNTP.pm
index 120292c..b9c5d6f 100644
--- a/lib/Net/NNTP.pm
+++ b/lib/Net/NNTP.pm
@@ -2,7 +2,7 @@
  #
  # Versions up to 2.24_1 Copyright (c) 1995-1997 Graham Barr <gbarr@pobox.com>.
  # All rights reserved.
-# Changes in Version 2.25 onwards Copyright (C) 2013-2014 Steve Hay.  All rights
+# Changes in Version 2.25 onwards Copyright (C) 2013-2015 Steve Hay.  All rights
  # reserved.
  # This module is free software; you can redistribute it and/or modify it under
  # the same terms as Perl itself, i.e. under the terms of either the GNU General
@@ -947,15 +947,17 @@ implementation) from the server. Returns the text or undef upon failure.
  
  The C<ihave> command informs the server that the client has an article
  whose id is C<MSGID>.  If the server desires a copy of that
-article, and C<MESSAGE> has been given the it will be sent.
+article and C<MESSAGE> has been given then it will be sent.
  
  Returns I<true> if the server desires the article and C<MESSAGE> was
-successfully sent,if specified.
+successfully sent, if specified.
  
  If C<MESSAGE> is not specified then the message must be sent using the
  C<datasend> and C<dataend> methods from L<Net::Cmd>
  
-C<MESSAGE> can be either an array of lines or a reference to an array.
+C<MESSAGE> can be either an array of lines or a reference to an array
+and must be encoded by the caller to octets of whatever encoding is required,
+e.g. by using the Encode module's C<encode()> function.
  
  =item last ()
  
@@ -1028,7 +1030,9 @@ is allowed then the message will be sent.
  If C<MESSAGE> is not specified then the message must be sent using the
  C<datasend> and C<dataend> methods from L<Net::Cmd>
  
-C<MESSAGE> can be either an array of lines or a reference to an array.
+C<MESSAGE> can be either an array of lines or a reference to an array
+and must be encoded by the caller to octets of whatever encoding is required,
+e.g. by using the Encode module's C<encode()> function.
  
  The message, either sent via C<datasend> or as the C<MESSAGE>
  parameter, must be in the format as described by RFC822 and must
diff --git a/lib/Net/SMTP.pm b/lib/Net/SMTP.pm
index e200e37..573d8ea 100644
--- a/lib/Net/SMTP.pm
+++ b/lib/Net/SMTP.pm
@@ -2,7 +2,7 @@
  #
  # Versions up to 2.31_1 Copyright (c) 1995-2004 Graham Barr <gbarr@pobox.com>.
  # All rights reserved.
-# Changes in Version 2.31_2 onwards Copyright (C) 2013-2014 Steve Hay.  All
+# Changes in Version 2.31_2 onwards Copyright (C) 2013-2015 Steve Hay.  All
  # rights reserved.
  # This module is free software; you can redistribute it and/or modify it under
  # the same terms as Perl itself, i.e. under the terms of either the GNU General
@@ -942,9 +942,12 @@ Synonyms for C<recipient>.
  
  Initiate the sending of the data from the current message. 
  
-C<DATA> may be a reference to a list or a list. If specified the contents
-of C<DATA> and a termination string C<".\r\n"> is sent to the server. And the
-result will be true if the data was accepted.
+C<DATA> may be a reference to a list or a list and must be encoded by the
+caller to octets of whatever encoding is required, e.g. by using the Encode
+module's C<encode()> function.
+
+If specified the contents of C<DATA> and a termination string C<".\r\n"> are
+sent to the server. The result will be true if the data was accepted.
  
  If C<DATA> is not specified then the result will indicate that the server
  wishes the data to be sent. The data must then be sent using the C<datasend>
diff --git a/t/datasend.t b/t/datasend.t
index 3a97c4b..0aea9d4 100644
--- a/t/datasend.t
+++ b/t/datasend.t
@@ -44,7 +44,7 @@ BEGIN {
  (my $libnet_t = __FILE__) =~ s/datasend.t/libnet_t.pl/;
  require $libnet_t or die;
  
-print "1..51\n";
+print "1..54\n";
  
  sub check {
    my $expect = pop;
@@ -158,3 +158,10 @@ check(
    "a\015\012..\015\012.\015\012",
  );
  
+# Test that datasend() plays nicely with bytes in an upgraded string,
+# even though the input should really be encode()d already.
+check(
+  substr("\x{100}", 0, 0) . "\x{e9}",
+
+  "\x{e9}\015\012.\015\012"
+);
diff --git a/t/pod_coverage.t b/t/pod_coverage.t
index 9cb64c2..3d674d4 100644
--- a/t/pod_coverage.t
+++ b/t/pod_coverage.t
@@ -7,7 +7,7 @@
  #   Test script to check POD coverage.
  #
  # COPYRIGHT
-#   Copyright (C) 2014 Steve Hay.  All rights reserved.
+#   Copyright (C) 2014, 2015 Steve Hay.  All rights reserved.
  #
  # LICENCE
  #   This script is free software; you can redistribute it and/or modify it under
@@ -48,7 +48,7 @@ MAIN: {
          my $params = { coverage_class => qw(Pod::Coverage::CountParents) };
          pod_coverage_ok('Net::Cmd', {
              %$params,
-            also_private => [qw(is_utf8 toascii toebcdic set_status)]
+            also_private => [qw(toascii toebcdic set_status)]
          });
          pod_coverage_ok('Net::Config', {
              %$params,
author	Steve Hay <steve.m.hay@googlemail.com>	2015-07-16 09:07:19 +0100
committer	Steve Hay <steve.m.hay@googlemail.com>	2015-07-16 09:07:19 +0100
commit	20056b26e77c3a0874195d8286538e83ff950004 (patch)
tree	03a69542fa058eeed14dc2cc2c10c854f772abe5
parent	975f3f66d1a24a9d2d2e69da00cb385eb114c1cb (diff)
download	perl-libnet-20056b26e77c3a0874195d8286538e83ff950004.tar.gz