# Copyright (C) 2018 all contributors
# License: GPL-3.0+
# frozen_string_literal: true

# NOTE(review): written in nested form so this file also loads when DTAS has
# not been defined yet; this assumes DTAS is a module (not a class) -- confirm.
module DTAS
  # Best-effort encoding guessing for strings whose encoding is unknown.
  #
  # Provenance: commit 702e3ad4, "player: support guessing encodings for
  # comments" (Eric Wong, 2018-01-29).  Per that commit message, the intent
  # is to fall back to Encoding.default_external (typically UTF-8) and then
  # again to the optional `charlock_holmes' gem if it is installed.  Path
  # names remain binary, because that's how proper filesystems operate.
  #
  # This module gets mixed into DTAS.  The +extended+ hook below initializes
  # the per-receiver detector cache, which is a tri-state:
  #   nil   - loading charlock_holmes has not been attempted yet
  #   false - loading was attempted and the gem is unavailable
  #   other - a CharlockHolmes::EncodingDetector instance
  module Encoding # :nodoc:
    def self.extended(mod)
      mod.instance_eval { @charlock_holmes = nil }
    end

    # Relabels +str+ in place as +enc+ and returns +str+ (the same object).
    # Only the encoding label is changed, never the bytes.
    #
    # str    - a mutable String
    # enc    - the encoding to try first (handed to String#force_encoding)
    # harder - when true (the default) and the bytes are not valid in +enc+,
    #          ask charlock_holmes for a guess before giving up
    #
    # If +str+ already carries +enc+ it is returned untouched (validity is
    # not checked in that case).  When no valid encoding can be found, the
    # original encoding is restored and a warning is emitted.
    def try_enc(str, enc, harder = true) # :nodoc:
      old = str.encoding
      return str if old == enc

      str.force_encoding(enc)
      return str if str.valid_encoding?

      if harder
        try_enc_harder(str, enc, old)
      else
        enc_fallback(str, enc, old)
      end
      str
    end

    private

    # Returns the cached detector, or false when charlock_holmes cannot be
    # loaded.  The load is attempted (and the warning printed) only once per
    # receiver.
    def enc_detector # :nodoc:
      if @charlock_holmes.nil?
        begin
          require 'charlock_holmes'
          @charlock_holmes = CharlockHolmes::EncodingDetector.new
        rescue LoadError
          warn "`charlock_holmes` gem not available for encoding detection"
          @charlock_holmes = false
        end
      end
      @charlock_holmes
    end

    # Second attempt after +enc+ turned out to be invalid for +str+.
    # The detector is resolved first so that the very first call already
    # detects (or falls back) instead of returning +str+ with an invalid
    # encoding label.  Always returns +str+.
    def try_enc_harder(str, enc, old) # :nodoc:
      detector = enc_detector
      # a detector may return nil when it has no guess; treat that like a
      # result without :ruby_encoding
      res = detector.detect(str) if detector
      det = res && res[:ruby_encoding]

      if det
        str.force_encoding(det)
        warn "charlock_holmes detected #{str.inspect} as #{det}..."
        enc_fallback(str, det, old) unless str.valid_encoding?
      else
        enc_fallback(str, enc, old)
      end
      str
    end

    # Gives up: restores the original encoding label +old+ on +str+ and
    # warns that +enc+ did not fit.
    def enc_fallback(str, enc, old) # :nodoc:
      str.force_encoding(old)
      warn "could not detect encoding for #{str.inspect} (not #{enc})"
    end
  end
end