From 4e154348637505aaa7cf8ed6912322ace46ef4b9 Mon Sep 17 00:00:00 2001
From: ChomeNS <95471003+ChomeNS@users.noreply.github.com>
Date: Wed, 30 Apr 2025 17:43:33 +0700
Subject: [PATCH] feat: from utf8 lossy with `<2122>` like vim

---
 build-number.txt                              |  2 +-
 .../chomens_bot/song/NBSConverter.java        |  3 +-
 .../chomens_bot/util/StringUtilities.java     | 66 +++++++++++++++++++
 3 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/build-number.txt b/build-number.txt
index 7c4af730..ce098b84 100644
--- a/build-number.txt
+++ b/build-number.txt
@@ -1 +1 @@
-2983
\ No newline at end of file
+2985
\ No newline at end of file
diff --git a/src/main/java/me/chayapak1/chomens_bot/song/NBSConverter.java b/src/main/java/me/chayapak1/chomens_bot/song/NBSConverter.java
index b028c00d..ce81738d 100644
--- a/src/main/java/me/chayapak1/chomens_bot/song/NBSConverter.java
+++ b/src/main/java/me/chayapak1/chomens_bot/song/NBSConverter.java
@@ -12,7 +12,6 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
-import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.util.*;
 
@@ -279,7 +278,7 @@ public class NBSConverter implements Converter {
         }
         final byte[] arr = new byte[length];
         buffer.get(arr, 0, length);
-        return new String(arr, StandardCharsets.UTF_8);
+        return StringUtilities.fromUTF8Lossy(arr);
     }
 
     private static long getMilliTime (final long tick, final double tempo) {
diff --git a/src/main/java/me/chayapak1/chomens_bot/util/StringUtilities.java b/src/main/java/me/chayapak1/chomens_bot/util/StringUtilities.java
index d1cd06a1..362300e1 100644
--- a/src/main/java/me/chayapak1/chomens_bot/util/StringUtilities.java
+++ b/src/main/java/me/chayapak1/chomens_bot/util/StringUtilities.java
@@ -29,6 +29,72 @@ public class StringUtilities {
         return removedCommand.toString();
     }
 
+    // Original author: ChatGPT
+    // Decodes input as UTF-8 and keeps undecodable bytes visible: well-formed sequences become
+    // their code points, and every byte that is not part of one is written as <00XX> (XX = the
+    // byte in upper-case hex). Overlong encodings, encoded surrogates and values above U+10FFFF
+    // are escaped as well, because new String(..., UTF_8) would silently turn them into U+FFFD.
+    // input must not be null.
+    public static String fromUTF8Lossy (final byte[] input) {
+        final StringBuilder result = new StringBuilder(input.length);
+        int i = 0;
+
+        while (i < input.length) {
+            final int b = input[i] & 0xFF;
+
+            if (b < 0x80) {
+                // ASCII byte (0xxxxxxx)
+                result.append((char) b);
+                i++;
+                continue;
+            }
+
+            final int seqLen = utf8SequenceLength(b);
+            final int codePoint = seqLen > 0 ? decodeUtf8Sequence(input, i, seqLen) : -1;
+
+            if (codePoint >= 0) {
+                result.appendCodePoint(codePoint);
+                i += seqLen;
+            } else {
+                // escape only this byte, then try to resynchronize on the next one
+                result.append(String.format("<%04X>", b));
+                i++;
+            }
+        }
+
+        return result.toString();
+    }
+
+    // Returns the sequence length announced by a non-ASCII lead byte, or -1 if it is not one.
+    private static int utf8SequenceLength (final int lead) {
+        if ((lead & 0xE0) == 0xC0) return 2;
+        if ((lead & 0xF0) == 0xE0) return 3;
+        if ((lead & 0xF8) == 0xF0) return 4;
+        return -1;
+    }
+
+    // Decodes the seqLen-byte sequence starting at offset, or returns -1 if it is truncated or
+    // malformed.
+    private static int decodeUtf8Sequence (final byte[] input, final int offset, final int seqLen) {
+        if (input.length - offset < seqLen) return -1;
+
+        // payload bits of the lead byte: 5, 4 or 3 for 2-, 3- or 4-byte sequences
+        int codePoint = input[offset] & (0xFF >> (seqLen + 1));
+
+        for (int j = 1; j < seqLen; j++) {
+            final int next = input[offset + j];
+            if ((next & 0xC0) != 0x80) return -1; // not a continuation byte (10xxxxxx)
+            codePoint = (codePoint << 6) | (next & 0x3F);
+        }
+
+        // reject overlong encodings, encoded surrogates and values beyond the Unicode range
+        final int minimum = seqLen == 2 ? 0x80 : seqLen == 3 ? 0x800 : 0x10000;
+        if (codePoint < minimum || codePoint > Character.MAX_CODE_POINT) return -1;
+        if (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE) return -1;
+
+        return codePoint;
+    }
+
     // https://stackoverflow.com/a/35148974/18518424
     public static String truncateToFitUtf8ByteLength (final String s, final int maxBytes) {
         if (s == null) {