diff --git a/lib/rackstash/fields/abstract_collection.rb b/lib/rackstash/fields/abstract_collection.rb index 5dfa130..dfa4100 100644 --- a/lib/rackstash/fields/abstract_collection.rb +++ b/lib/rackstash/fields/abstract_collection.rb @@ -11,9 +11,13 @@ require 'uri' require 'concurrent' +require 'rackstash/helpers' + module Rackstash module Fields class AbstractCollection + include Rackstash::Helpers::UTF8 + # Equality -- Two collections are equal if they are of exactly the same # class and contain the same raw data according to `Object#==`. # @@ -87,22 +91,6 @@ module Rackstash end end - # Encode the given String in UTF-8. If the given `str` is already - # correctly encoded and frozen, we just return it unchanged. In all other - # cases we return a UTF-8 encoded and frozen copy of the string. - # - # @param str [String, #to_s] - # @return [String] - def utf8_encode(str) - if str.instance_of?(String) && str.encoding == Encoding::UTF_8 && str.valid_encoding? - str.frozen? ? str : str.dup.freeze - else - str = str.to_s - str = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace) - str.freeze - end - end - def resolve_value(value, scope: nil) return value unless value.is_a?(Proc) scope.nil? ? value.call : scope.instance_exec(&value) @@ -189,7 +177,7 @@ module Rackstash return normalize(value, scope: scope, wrap: wrap) end - utf8_encode(value.inspect) + utf8_encode(value.inspect.freeze) end end end diff --git a/lib/rackstash/fields/tags.rb b/lib/rackstash/fields/tags.rb index 3b76623..3fdc5b0 100644 --- a/lib/rackstash/fields/tags.rb +++ b/lib/rackstash/fields/tags.rb @@ -76,7 +76,7 @@ module Rackstash value.flatten! 
value else - utf8_encode(value).strip + utf8_encode(value).strip.freeze end end end diff --git a/lib/rackstash/helpers.rb b/lib/rackstash/helpers.rb new file mode 100644 index 0000000..e403eef --- /dev/null +++ b/lib/rackstash/helpers.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +# Copyright 2017 Holger Just +# +# This software may be modified and distributed under the terms +# of the MIT license. See the LICENSE.txt file for details. + +module Rackstash + # Some utility functions which are used throughout Rackstash. + module Helpers + end +end + +require 'rackstash/helpers/utf8' diff --git a/lib/rackstash/helpers/utf8.rb b/lib/rackstash/helpers/utf8.rb new file mode 100644 index 0000000..6665216 --- /dev/null +++ b/lib/rackstash/helpers/utf8.rb @@ -0,0 +1,31 @@ +# frozen_string_literal: true + +# Copyright 2017 Holger Just +# +# This software may be modified and distributed under the terms +# of the MIT license. See the LICENSE.txt file for details. + +module Rackstash + module Helpers + # Provide helper functions to help with UTF8 handling of Strings. + module UTF8 + protected + + # Encode the given String in UTF-8. If the given `str` is already + # correctly encoded and frozen, we just return it unchanged. In all other + # cases we return a UTF-8 encoded and frozen copy of the string. + # + # @param str [String, #to_s] + # @return [String] + def utf8_encode(str) + if str.instance_of?(String) && str.encoding == Encoding::UTF_8 && str.valid_encoding? + str.frozen? ? 
str : str.dup.freeze + else + str = str.to_s + str = str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace) + str.freeze + end + end + end + end +end diff --git a/spec/rackstash/fields/abstract_collection_spec.rb b/spec/rackstash/fields/abstract_collection_spec.rb index 5f3b8fc..7a8f017 100644 --- a/spec/rackstash/fields/abstract_collection_spec.rb +++ b/spec/rackstash/fields/abstract_collection_spec.rb @@ -116,47 +116,14 @@ describe Rackstash::Fields::AbstractCollection do end describe '#normalize' do - describe 'with String' do - it 'transforms encoding to UTF-8' do - utf8_str = 'Dönerstraße' - latin_str = utf8_str.encode(Encoding::ISO8859_9) - expect(latin_str.encoding).to eql Encoding::ISO8859_9 + it 'encodes Strings to UTF-8' do + utf8_str = 'Dönerstraße' + latin_str = utf8_str.encode(Encoding::ISO8859_9) + expect(latin_str.encoding).to eql Encoding::ISO8859_9 - expect(normalize(latin_str)).to eql utf8_str - expect(normalize(latin_str).encoding).to eql Encoding::UTF_8 - expect(normalize(latin_str)).to be_frozen - end - - it 'replaces invalid characters in correctly encoded strings' do - binary = Digest::SHA256.digest('string') - - expect(normalize(binary)).to include '�' - expect(normalize(binary).encoding).to eql Encoding::UTF_8 - expect(normalize(binary)).to be_frozen - end - - it 'replaces invalid characters in incorrectly encoded strings' do - strange = Digest::SHA256.digest('string').force_encoding(Encoding::UTF_8) - - expect(normalize(strange)).to include '�' - expect(normalize(strange).encoding).to eql Encoding::UTF_8 - expect(normalize(strange)).to be_frozen - end - - it 'dups and freezes valid strings' do - valid = String.new('Dönerstraße') - expect(valid).to_not be_frozen - - expect(normalize(valid)).to eql(valid) - # Not object-equal since the string was dup'ed - expect(normalize(valid)).not_to equal valid - expect(normalize(valid)).to be_frozen - end - - it 'does not alter valid frozen strings' do - valid = 'Dönerstraße'.freeze - 
expect(normalize(valid)).to equal(valid) - end + expect(normalize(latin_str)).to eql utf8_str + expect(normalize(latin_str).encoding).to eql Encoding::UTF_8 + expect(normalize(latin_str)).to be_frozen end it 'transforms Symbol to String' do diff --git a/spec/rackstash/helpers/utf8_spec.rb b/spec/rackstash/helpers/utf8_spec.rb new file mode 100644 index 0000000..460729b --- /dev/null +++ b/spec/rackstash/helpers/utf8_spec.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +# Copyright 2017 Holger Just +# +# This software may be modified and distributed under the terms +# of the MIT license. See the LICENSE.txt file for details. + +require 'spec_helper' + +require 'rackstash/helpers/utf8' + +describe Rackstash::Helpers::UTF8 do + it 'only defines protected methods' do + expect(described_class.public_instance_methods(false)).to be_empty + end + + describe '#utf8_encode' do + def utf8_encode(*args) + Object.new.extend(described_class).send(:utf8_encode, *args) + end + + it 'transforms encoding to UTF-8' do + utf8_str = 'Dönerstraße' + latin_str = utf8_str.encode(Encoding::ISO8859_9) + expect(latin_str.encoding).to eql Encoding::ISO8859_9 + + expect(utf8_encode(latin_str)).to eql utf8_str + expect(utf8_encode(latin_str).encoding).to eql Encoding::UTF_8 + expect(utf8_encode(latin_str)).to be_frozen + end + + it 'replaces invalid characters in correctly encoded strings' do + binary = Digest::SHA256.digest('string') + + expect(utf8_encode(binary)).to include '�' + expect(utf8_encode(binary).encoding).to eql Encoding::UTF_8 + expect(utf8_encode(binary)).to be_frozen + end + + it 'replaces invalid characters in incorrectly encoded strings' do + strange = Digest::SHA256.digest('string').force_encoding(Encoding::UTF_8) + + expect(utf8_encode(strange)).to include '�' + expect(utf8_encode(strange).encoding).to eql Encoding::UTF_8 + expect(utf8_encode(strange)).to be_frozen + end + + it 'dups and freezes valid strings' do + valid = String.new('Dönerstraße') + 
expect(valid).to_not be_frozen + + expect(utf8_encode(valid)).to eql(valid) + # Not object-equal since the string was dup'ed + expect(utf8_encode(valid)).not_to equal valid + expect(utf8_encode(valid)).to be_frozen + end + + it 'does not alter valid frozen strings' do + valid = 'Dönerstraße'.freeze + expect(utf8_encode(valid)).to equal(valid) + end + end +end