From 702e3ad47738363ab847f43155e047c7c6612f80 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Mon, 29 Jan 2018 00:43:45 +0000 Subject: player: support guessing encodings for comments This can be helpful for end users and is close to what other players use. We can fall back to Encoding.default_external by default (typically UTF-8) and then again using `charlock_holmes' if installed. Note: path names remain binary, because that's how proper filesystems operate. --- lib/dtas.rb | 2 ++ lib/dtas/encoding.rb | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/dtas/source/sox.rb | 4 +++- test/test_encoding.rb | 20 +++++++++++++++++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 lib/dtas/encoding.rb create mode 100644 test/test_encoding.rb diff --git a/lib/dtas.rb b/lib/dtas.rb index ac416d7..3c2cdb4 100644 --- a/lib/dtas.rb +++ b/lib/dtas.rb @@ -42,3 +42,5 @@ end require_relative 'dtas/compat_onenine' require_relative 'dtas/spawn_fix' +require_relative 'dtas/encoding' +DTAS.extend(DTAS::Encoding) diff --git a/lib/dtas/encoding.rb b/lib/dtas/encoding.rb new file mode 100644 index 0000000..71c877f --- /dev/null +++ b/lib/dtas/encoding.rb @@ -0,0 +1,58 @@ +# Copyright (C) 2018 all contributors +# License: GPL-3.0+ +# frozen_string_literal: true + +# This module is mixed into DTAS via DTAS.extend +module DTAS::Encoding # :nodoc: + def self.extended(mod) + mod.instance_eval { @charlock_holmes = nil} + end + +private + + def try_enc_harder(str, enc, old) # :nodoc: + case @charlock_holmes + when nil + begin + require 'charlock_holmes' + @charlock_holmes = CharlockHolmes::EncodingDetector.new + rescue LoadError + warn "`charlock_holmes` gem not available for encoding detection" + @charlock_holmes = false + end + when false + enc_fallback(str, enc, old) + else + res = @charlock_holmes.detect(str) + if det = res[:ruby_encoding] + str.force_encoding(det) + warn "charlock_holmes detected #{str.inspect} as #{det}..." + str.valid_encoding? 
or enc_fallback(str, det, old) + else + enc_fallback(str, enc, old) + end + end + str + end + + def enc_fallback(str, enc, old) # :nodoc: + str.force_encoding(old) + warn "could not detect encoding for #{str.inspect} (not #{enc})" + end + +public + + def try_enc(str, enc, harder = true) # :nodoc: + old = str.encoding + return str if old == enc + str.force_encoding(enc) + unless str.valid_encoding? + if harder + try_enc_harder(str, enc, old) + else + enc_fallback(str, enc, old) + end + end + str + end +end diff --git a/lib/dtas/source/sox.rb b/lib/dtas/source/sox.rb index f702b41..5e967c1 100644 --- a/lib/dtas/source/sox.rb +++ b/lib/dtas/source/sox.rb @@ -50,17 +50,19 @@ class DTAS::Source::Sox # :nodoc: out =~ /^Sample Rate\s*:\s*(\d+)/n and dst['rate'] = $1.to_i out =~ /^Precision\s*:\s*(\d+)-bit/n and dst['bits'] = $1.to_i + enc = Encoding.default_external # typically Encoding::UTF_8 if out =~ /\nComments\s*:[ \t]*\n?(.*)\z/mn comments = dst['comments'] = {} key = nil $1.split(/\n/n).each do |line| if line.sub!(/^([^=]+)=/ni, '') - key = DTAS.dedupe_str($1.upcase) + key = DTAS.dedupe_str(DTAS.try_enc($1.upcase, enc)) end (comments[key] ||= ''.b) << "#{line}\n" unless line.empty? end comments.each do |k,v| v.chomp! + DTAS.try_enc(v, enc) comments[k] = DTAS.dedupe_str(v) end end diff --git a/test/test_encoding.rb b/test/test_encoding.rb new file mode 100644 index 0000000..d9af968 --- /dev/null +++ b/test/test_encoding.rb @@ -0,0 +1,20 @@ +# Copyright (C) 2018 all contributors +# License: GPL-3.0+ +# frozen_string_literal: true +require './test/helper' +require 'dtas' +require 'yaml' + +class TestEncoding < Testcase + def test_encoding + data = <<EOD +--- +comments: + ARTIST: !binary |- + RW5yaXF1ZSBSb2Ryw61ndWV6 +EOD + hash = YAML.load(data) + artist = DTAS.try_enc(hash['comments']['ARTIST'], Encoding::UTF_8) + assert_equal 'Enrique Rodríguez', artist + end +end -- cgit v1.2.3-24-ge0c7