From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00, URIBL_BLOCKED shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 99A761F463 for ; Thu, 12 Dec 2019 20:01:14 +0000 (UTC) From: Eric Wong To: Subject: [PATCH] add man2html generator Date: Thu, 12 Dec 2019 20:01:14 +0000 Message-Id: <20191212200114.11738-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: The man2html(1) and groff(1) HTML generation isn't compatible anchor-wise with what pandoc(1) generated. They also had too much styling for my liking. --- bin/olddoc | 4 +- lib/olddoc.rb | 1 + lib/olddoc/man2html.rb | 149 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 lib/olddoc/man2html.rb diff --git a/bin/olddoc b/bin/olddoc index f7e80fb..986cc87 100755 --- a/bin/olddoc +++ b/bin/olddoc @@ -1,5 +1,5 @@ #!/usr/bin/env ruby -# Copyright (C) 2015, all contributors +# Copyright (C) 2015,2019 all contributors $stderr.sync = $stdout.sync = true tasks = %w(prepare merge) usage = "Usage: #{File.basename($0)} [#{tasks.join('|')}]" @@ -10,6 +10,8 @@ when "prepare" Olddoc::Prepare.new(opts).run when "merge" Olddoc::Merge.new(opts).run +when "man2html" + Olddoc::Man2HTML.new(opts).run(ARGV[1..-1]) else warn "#{$0.inspect} #{ARGV.inspect} not understood" abort usage diff --git a/lib/olddoc.rb b/lib/olddoc.rb index d5d6b37..e4cd344 100644 --- a/lib/olddoc.rb +++ b/lib/olddoc.rb @@ -8,6 +8,7 @@ module Olddoc # :nodoc: autoload :NewsRdoc, 'olddoc/news_rdoc' autoload :Prepare, 'olddoc/prepare' autoload :Readme, 'olddoc/readme' + autoload :Man2HTML, 'olddoc/man2html' def self.config(path = ".olddoc.yml") File.readable?(path) and return YAML.load(File.read(path)) diff --git a/lib/olddoc/man2html.rb b/lib/olddoc/man2html.rb new file mode 100644 index 0000000..82254d2 --- /dev/null +++ b/lib/olddoc/man2html.rb @@ -0,0 +1,149 @@ +# Copyright (C) 2019 all contributors +# License: GPL-3.0+ +# frozen_string_literal: true +require 'digest' +require 'optparse' + +# linkifier for manpages rendered to a terminal. man2html(1) and +# groff generate too much style + +class Olddoc::Man2HTML # :nodoc: + SALT = rand + LINK_RE = %r{([\('!])?\b((?:ftps?|https?|nntps?|gopher):// + [\@:\w\.-]+(?:/ + (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) + (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? + (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? + )? + )}xi + + PAIRS = { + "(" => %r/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays) + "'" => %r/('[\.,;\+]?)\z/, # Perl / Ruby + "!" => %r/(![\.,;\+]?)\z/, # Perl / Ruby + } + + def initialize(opts) # :nodoc: + end + + def run(argv) # :nodoc: + out = $stdout + OptionParser.new("", 24, ' ') do |opts| + opts.on('-o', '--output PATH', 'output to given file') { |path| + out = File.open(path, 'w') + } + opts.parse!(argv) + end + argv[0] or abort 'manpage required' + cols = '72' + env = ENV.to_hash + env.merge!({ 'COLUMNS' => cols, 'MANWIDTH' => cols, 'TERM' => 'dumb' }) + + # note: I don't care for the styles groff and man2html throw + # on us, I just want indented and wrapped text with + # for URLs. + + # try man-db options, first: + str = IO.popen(env, ['man', '--nh', '--nj', *argv], &:read) + + if str.empty? || !$?.success? + str = IO.popen(env, ['man', *argv], &:read) + end + if $?.success? + sections = '[A-Z][A-Z ]+' + str = str.split(/^(#{sections})$/mo) + + str = str.map! do |s| + case s + when /\A(#{sections})$/o + # this is to be compatible with HTML fragments pandoc used + sec = $1 + anchor = sec.downcase.tr(' ', '-') + "#{sec}" + else + state = linkify_1(s) + s.encode!(xml: :text) + linkify_2(state, s) + s.rstrip! + s.empty? ? '' : "
#{s}
" + end + end.join + + out.print(str) + + # use mtime of the original source + if out.respond_to?(:path) + path = out.path + out.close + stat = src_input_stat(argv) + File.utime(stat.atime, stat.mtime, path) if stat + end + end + end + + def src_input_stat(argv) + argv.reverse_each do |f| + next unless File.file?(f) + return File.stat(f) + end + + argv.reverse_each do |f| + path = IO.popen(%W(man -w #{f}), &:read) + path.chomp! + next unless File.file?(path) + return File.stat(path) + end + nil + end + + def linkify_1(str) # :nodoc: + state = {} + str.gsub!(LINK_RE) do + head = $1 || '' + url = $2.dup + tail = ''.dup + + # it's fairly common to end URLs in messages with + # '.', ',' or ';' to denote the end of a statement; + # assume the intent was to end the statement/sentence + # in English + if re = PAIRS[head] + url.sub!(re, '') + tail = $1 + elsif url.sub!(/(\))?([\.,;])\z/, '') + tail = $2 + # require ')' to be paired with '(' + if $1 # ')' + if url.index('(').nil? + tail = ")#{tail}" + else + url += ')' + end + end + elsif url !~ /\(/ && url.sub!(/\)\z/, '') + tail = ')' + end + + # salt this, as this could be exploited to show + # links in the HTML which don't show up in the raw mail. + key = Digest::MD5.hexdigest("#{url}#{SALT}").freeze + state[key] = url + "#{head}OLD-LINK-#{key}#{tail}" + end + state + end + + def linkify_2(state, str) # :nodoc: + # Added "OLD-LINK-" prefix to avoid false-positives on git commits + str.gsub!(/\bOLD-LINK-([a-f0-9]{32})\b/) do + key = $1 + url = state[key] + if url + %Q{#{url.encode(xml: :text)}
} + else + # false positive or somebody tried to mess with us + key + end + end + end +end