Fix an issue where the charset in the link preview of some pages was not identified correctly.

This commit is contained in:
Yuval Razieli
2023-07-08 16:14:13 +03:00
committed by Clark Chen
parent 5ca025544e
commit 23ef8c78bd

View File

@@ -1,6 +1,7 @@
package org.thoughtcrime.securesms.util;
import androidx.annotation.NonNull;
import androidx.core.text.HtmlCompat;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -8,12 +9,16 @@ import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.MediaType;
import okhttp3.ResponseBody;
public final class OkHttpUtil {
/**
 * Locates a {@code charset=NAME} declaration (optionally quoted with {@code "} or {@code '})
 * inside raw HTML text; capture group 1 holds the charset name.
 *
 * The name class includes {@code _} so that names such as {@code Shift_JIS} are captured whole
 * instead of being truncated at the underscore, and the match is case-insensitive so that
 * upper-case markup ({@code CHARSET=...}) is recognised as well.
 */
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([a-zA-Z0-9_\\\\-]+)[\"']?", Pattern.CASE_INSENSITIVE);
// Private, empty constructor: keeps code outside this class from instantiating it.
private OkHttpUtil() {}
public static byte[] readAsBytes(@NonNull InputStream bodyStream, long sizeLimit) throws IOException {
@@ -41,8 +46,24 @@ public final class OkHttpUtil {
byte[] data = readAsBytes(body.byteStream(), sizeLimit);
MediaType contentType = body.contentType();
Charset charset = contentType != null ? contentType.charset(StandardCharsets.UTF_8) : StandardCharsets.UTF_8;
Charset charset = contentType != null ? contentType.charset(null) : null;
charset = charset == null ? getHtmlCharset(new String(data)) : charset;
return new String(data, Objects.requireNonNull(charset));
}
/**
 * Looks for the first charset declaration in the given HTML text and resolves it to a
 * {@link Charset}.
 *
 * @param html the page content to scan with {@code CHARSET_PATTERN}
 * @return the declared charset, or UTF-8 when nothing matches or when decoding/resolving the
 *         captured name throws
 */
private static @NonNull Charset getHtmlCharset(String html) {
  Matcher matcher         = CHARSET_PATTERN.matcher(html);
  boolean hasCapturedName = matcher.find() && matcher.groupCount() > 0;

  if (!hasCapturedName) {
    return StandardCharsets.UTF_8;
  }

  try {
    String declaredName = fromDoubleEncoded(matcher.group(1));
    return Objects.requireNonNull(Charset.forName(declaredName));
  } catch (Exception ignored) {
    // Best effort: any failure while decoding or resolving the name falls back to the default.
    return StandardCharsets.UTF_8;
  }
}
/**
 * Passes the text through {@link HtmlCompat#fromHtml} twice in legacy mode and returns the
 * resulting plain string.
 *
 * NOTE(review): presumably this exists so that text whose HTML entities were escaped twice
 * comes out fully decoded — confirm against the pages that motivated it.
 *
 * @param html the text to decode
 * @return the string form of the twice-converted text
 */
private static @NonNull String fromDoubleEncoded(@NonNull String html) {
  String firstPass  = HtmlCompat.fromHtml(html, HtmlCompat.FROM_HTML_MODE_LEGACY).toString();
  String secondPass = HtmlCompat.fromHtml(firstPass, HtmlCompat.FROM_HTML_MODE_LEGACY).toString();

  return secondPass;
}
}