about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2019-12-12 18:58:24 +0000
committer Eric Wong <e@80x24.org> 2019-12-12 19:49:13 +0000
commit 9910c591caa29efe08645ff5dec26566803daa7c (patch)
tree dac49e7c33d831642db63c225c69a5ae42907e90
parent 597e4958b90d16bc945436b9acfc55ed41e71f7a (diff)
download olddoc-9910c591caa29efe08645ff5dec26566803daa7c.tar.gz
The man2html(1) and groff(1) HTML generation isn't compatible
anchor-wise with what pandoc(1) generated.  They also had too
much styling for my liking.
-rwxr-xr-x bin/olddoc 4
-rw-r--r-- lib/olddoc.rb 1
-rw-r--r-- lib/olddoc/man2html.rb 149
3 files changed, 153 insertions, 1 deletions
diff --git a/bin/olddoc b/bin/olddoc
index f7e80fb..986cc87 100755
--- a/bin/olddoc
+++ b/bin/olddoc
@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-# Copyright (C) 2015, all contributors <olddoc-public@80x24.org>
+# Copyright (C) 2015,2019 all contributors <olddoc-public@80x24.org>
 $stderr.sync = $stdout.sync = true
 tasks = %w(prepare merge)
 usage = "Usage: #{File.basename($0)} [#{tasks.join('|')}]"
@@ -10,6 +10,8 @@ when "prepare"
   Olddoc::Prepare.new(opts).run
 when "merge"
   Olddoc::Merge.new(opts).run
+when "man2html"
+  Olddoc::Man2HTML.new(opts).run(ARGV[1..-1])
 else
   warn "#{$0.inspect} #{ARGV.inspect} not understood"
   abort usage
diff --git a/lib/olddoc.rb b/lib/olddoc.rb
index d5d6b37..e4cd344 100644
--- a/lib/olddoc.rb
+++ b/lib/olddoc.rb
@@ -8,6 +8,7 @@ module Olddoc # :nodoc:
   autoload :NewsRdoc, 'olddoc/news_rdoc'
   autoload :Prepare, 'olddoc/prepare'
   autoload :Readme, 'olddoc/readme'
+  autoload :Man2HTML, 'olddoc/man2html'
 
   def self.config(path = ".olddoc.yml")
     File.readable?(path) and return YAML.load(File.read(path))
diff --git a/lib/olddoc/man2html.rb b/lib/olddoc/man2html.rb
new file mode 100644
index 0000000..82254d2
--- /dev/null
+++ b/lib/olddoc/man2html.rb
@@ -0,0 +1,149 @@
+# Copyright (C) 2019 all contributors <olddoc-public@80x24.org>
+# License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+# frozen_string_literal: true
+require 'digest'
+require 'optparse'
+
+# linkifier for manpages rendered to a terminal.  man2html(1) and
+# groff generate too much style
+
+class Olddoc::Man2HTML # :nodoc:
+  SALT = rand
+  LINK_RE = %r{([\('!])?\b((?:ftps?|https?|nntps?|gopher)://
+     [\@:\w\.-]+(?:/
+     (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
+     (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
+     (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
+     )?
+    )}xi
+
+  PAIRS = {
+  "(" => %r/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays)
+  "'" => %r/('[\.,;\+]?)\z/, # Perl / Ruby
+  "!" => %r/(![\.,;\+]?)\z/, # Perl / Ruby
+  }
+
+  def initialize(opts) # :nodoc:
+  end
+
+  def run(argv) # :nodoc:
+    out = $stdout
+    OptionParser.new("", 24, '  ') do |opts|
+      opts.on('-o', '--output PATH', 'output to given file') { |path|
+        out = File.open(path, 'w')
+      }
+      opts.parse!(argv)
+    end
+    argv[0] or abort 'manpage required'
+    cols = '72'
+    env = ENV.to_hash
+    env.merge!({ 'COLUMNS' => cols, 'MANWIDTH' => cols, 'TERM' => 'dumb' })
+
+    # note: I don't care for the styles groff and man2html throw
+    # on us, I just want indented and wrapped text with <a hrefs>
+    # for URLs.
+
+    # try man-db options, first:
+    str = IO.popen(env, ['man', '--nh', '--nj', *argv], &:read)
+
+    if str.empty? || !$?.success?
+      str = IO.popen(env, ['man', *argv], &:read)
+    end
+    if $?.success?
+      sections = '[A-Z][A-Z ]+'
+      str = str.split(/^(#{sections})$/mo)
+
+      str = str.map! do |s|
+        case s
+        when /\A(#{sections})$/o
+          # this is to be compatible with HTML fragments pandoc used
+          sec = $1
+          anchor = sec.downcase.tr(' ', '-')
+          "<h1\nid=#{anchor.encode(xml: :attr)}>#{sec}</h1>"
+        else
+          state = linkify_1(s)
+          s.encode!(xml: :text)
+          linkify_2(state, s)
+          s.rstrip!
+          s.empty? ? '' : "<pre>#{s}</pre>"
+        end
+      end.join
+
+      out.print(str)
+
+      # use mtime of the original source
+      if out.respond_to?(:path)
+        path = out.path
+        out.close
+        stat = src_input_stat(argv)
+        File.utime(stat.atime, stat.mtime, path) if stat
+      end
+    end
+  end
+
+  def src_input_stat(argv)
+    argv.reverse_each do |f|
+      next unless File.file?(f)
+      return File.stat(f)
+    end
+
+    argv.reverse_each do |f|
+      path = IO.popen(%W(man -w #{f}), &:read)
+      path.chomp!
+      next unless File.file?(path)
+      return File.stat(path)
+    end
+    nil
+  end
+
+  def linkify_1(str) # :nodoc:
+    state = {}
+    str.gsub!(LINK_RE) do
+      head = $1 || ''
+      url = $2.dup
+      tail = ''.dup
+
+      # it's fairly common to end URLs in messages with
+      # '.', ',' or ';' to denote the end of a statement;
+      # assume the intent was to end the statement/sentence
+      # in English
+      if re = PAIRS[head]
+        url.sub!(re, '')
+        tail = $1
+      elsif url.sub!(/(\))?([\.,;])\z/, '')
+        tail = $2
+        # require ')' to be paired with '('
+        if $1  # ')'
+          if url.index('(').nil?
+            tail = ")#{tail}"
+          else
+            url += ')'
+          end
+        end
+      elsif url !~ /\(/ && url.sub!(/\)\z/, '')
+        tail = ')'
+      end
+
+      # salt this, as this could be exploited to show
+      # links in the HTML which don't show up in the raw mail.
+      key = Digest::MD5.hexdigest("#{url}#{SALT}").freeze
+      state[key] = url
+      "#{head}OLD-LINK-#{key}#{tail}"
+    end
+    state
+  end
+
+  def linkify_2(state, str) # :nodoc:
+    # Added "OLD-LINK-" prefix to avoid false-positives on git commits
+    str.gsub!(/\bOLD-LINK-([a-f0-9]{32})\b/) do
+      key = $1
+      url = state[key]
+      if url
+        %Q{<a\nhref=#{url.encode(xml: :attr)}>#{url.encode(xml: :text)}</a>}
+      else
+        # false positive or somebody tried to mess with us
+        key
+      end
+    end
+  end
+end