Fix CSV import file encoding auto-detection failure with multibyte characters (#41464).

Patch by Go MAEDA (user:maeda). git-svn-id: https://svn.redmine.org/redmine/trunk@23150 e93f8b46-1217-0410-a6f0-8f06a7374b81
2026-03-17 22:48:14 +00:00 · 2024-10-20 06:47:28 +00:00 · 2024-10-20 06:47:28 +00:00 · 1d46be8b0f
commit 1d46be8b0f
parent 7c66cdaaaf
3 changed files with 49 additions and 1 deletions
--- a/app/models/import.rb
+++ b/app/models/import.rb
@ -69,7 +69,7 @@ class Import < ApplicationRecord
    encoding = lu(user, :general_csv_encoding)
    if file_exists?
      begin
-        content = File.read(filepath, 256)
+        content = read_file_head

        separator = [',', ';'].max_by {|sep| content.count(sep)}
        wrapper = ['"', "'"].max_by {|quote_char| content.count(quote_char)}
@ -248,6 +248,20 @@ class Import < ApplicationRecord

  private

+  # Returns the leading part of the import file, reading no more than
+  # max_read_bytes bytes. Returns '' when file_exists? is false.
+  def read_file_head(max_read_bytes = 4096)
+    return '' unless file_exists?
+    return File.read(filepath, mode: 'rb') unless File.size(filepath) > max_read_bytes
+
+    # A fixed-size read can stop partway through a multi-byte character and
+    # leave an invalid byte sequence at the end, so everything after the
+    # last LF is dropped. Without any LF the head is returned unchanged.
+    head = File.read(filepath, max_read_bytes)
+    cut = head.rindex("\n")
+    cut ? head[0..cut] : head
+  end
+
  def read_rows
    return unless file_exists?

--- a/test/fixtures/files/mbcs-multiline-text.txt
+++ b/test/fixtures/files/mbcs-multiline-text.txt
@ -0,0 +1,17 @@
+An emoticon is represented by 4 bytes in UTF-8 encoding.
+
+If you simply read the first 4096 bytes of this file, the trailing characters of a multi-byte sequence might be cut off, resulting in an invalid UTF-8 string.
+
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
--- a/test/unit/issue_import_test.rb
+++ b/test/unit/issue_import_test.rb
@ -464,6 +464,23 @@ class IssueImportTest < ActiveSupport::TestCase
    end
  end

+  def test_encoding_guessing_respects_multibyte_boundaries
+    # Sanity check on the fixture: its first 4096 bytes must end partway
+    # through a multi-byte character, so that a naive fixed-size read
+    # yields a string that is not valid UTF-8.
+    fixture_name = 'mbcs-multiline-text.txt'
+    fixture_path = Rails.root.join('test', 'fixtures', 'files', fixture_name)
+    head = File.read(fixture_path, 4096).force_encoding('UTF-8') # => "...😃😄😅\xF0\x9F"
+    assert_not head.valid_encoding?
+
+    import = generate_import(fixture_name)
+    with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
+      import.set_default_settings
+      # Detection must still settle on UTF-8 despite the cut character.
+      assert_equal 'UTF-8', import.settings['encoding']
+    end
+  end
+
  def test_set_default_settings_should_detect_field_wrapper
    to_test = {
      'import_issues.csv' => '"',