feat: from utf8 lossy with <2122> like vim

This commit is contained in:
ChomeNS
2025-04-30 17:43:33 +07:00
parent 96f5f66f01
commit 4e15434863
3 changed files with 53 additions and 3 deletions

View File

@@ -1 +1 @@
2983
2985

View File

@@ -12,7 +12,6 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.*;
@@ -279,7 +278,7 @@ public class NBSConverter implements Converter {
}
final byte[] arr = new byte[length];
buffer.get(arr, 0, length);
return new String(arr, StandardCharsets.UTF_8);
return StringUtilities.fromUTF8Lossy(arr);
}
private static long getMilliTime (final long tick, final double tempo) {

View File

@@ -29,6 +29,57 @@ public class StringUtilities {
return removedCommand.toString();
}
// Author: ChatGPT
/**
 * Decodes {@code input} as UTF-8 without silently losing malformed data.
 *
 * <p>Every well-formed UTF-8 sequence is decoded to its code point. Any byte
 * that is not part of a well-formed sequence is rendered as the literal text
 * {@code <00XX>}, where {@code XX} is the byte value in upper-case hex, and
 * decoding resumes at the very next byte.
 *
 * <p>A multi-byte sequence is only accepted when it has the right lead and
 * continuation bit patterns <em>and</em> encodes a legal scalar value:
 * overlong encodings (e.g. {@code C0 80}), encoded surrogates
 * ({@code ED A0 80} .. {@code ED BF BF}) and values above U+10FFFF are
 * escaped byte by byte rather than being replaced with U+FFFD.
 *
 * @param input the raw bytes to decode; must not be {@code null}
 * @return the decoded text with invalid bytes escaped as {@code <00XX>}
 * @throws NullPointerException if {@code input} is {@code null}
 */
public static String fromUTF8Lossy (final byte[] input) {
    final StringBuilder result = new StringBuilder(input.length);

    int i = 0;
    while (i < input.length) {
        final byte b = input[i];

        if ((b & 0x80) == 0) {
            // ASCII byte (0xxxxxxx)
            result.append((char) b);
            i++;
            continue;
        }

        // Classify the lead byte: expected sequence length, the payload bits
        // it carries, and the smallest code point that legitimately needs
        // that many bytes (anything smaller is an overlong encoding).
        int seqLen = 0;
        int minCodePoint = 0;
        int codePoint = 0;

        if ((b & 0xE0) == 0xC0) {
            seqLen = 2;
            minCodePoint = 0x80;
            codePoint = b & 0x1F;
        } else if ((b & 0xF0) == 0xE0) {
            seqLen = 3;
            minCodePoint = 0x800;
            codePoint = b & 0x0F;
        } else if ((b & 0xF8) == 0xF0) {
            seqLen = 4;
            minCodePoint = 0x10000;
            codePoint = b & 0x07;
        }

        // A stray continuation byte or an unknown lead pattern leaves
        // seqLen at 0; a truncated sequence fails the length check.
        boolean valid = seqLen != 0 && input.length - i >= seqLen;

        for (int j = 1; valid && j < seqLen; j++) {
            final byte next = input[i + j];

            if ((next & 0xC0) == 0x80) {
                codePoint = (codePoint << 6) | (next & 0x3F);
            } else {
                valid = false;
            }
        }

        // new String(..., UTF_8) never throws on malformed input (it
        // substitutes U+FFFD), so the scalar value is validated here instead.
        final boolean isSurrogate =
                codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE;

        valid = valid
                && codePoint >= minCodePoint
                && codePoint <= Character.MAX_CODE_POINT
                && !isSurrogate;

        if (valid) {
            result.appendCodePoint(codePoint);
            i += seqLen;
        } else {
            // Escape only the offending byte; the following bytes get their
            // own chance to start a valid sequence.
            result.append(String.format("<%04X>", b & 0xFF));
            i++;
        }
    }

    return result.toString();
}
// https://stackoverflow.com/a/35148974/18518424
public static String truncateToFitUtf8ByteLength (final String s, final int maxBytes) {
if (s == null) {