Better binary file detection

This commit is contained in:
Nico Mexis 2022-01-12 00:44:23 +01:00
parent f85ee82202
commit 5bbe04befe
No known key found for this signature in database
GPG key ID: 27D6E17CE092AB78
2 changed files with 15 additions and 7 deletions

View file

@ -73,13 +73,12 @@ public class FileViewer extends ResourceViewer
final byte[] contents = resource.getResourceBytes();
final String nameLowerCase = this.resource.name.toLowerCase();
final String onlyName = FilenameUtils.getName(nameLowerCase);
final String contentsAsString = new String(contents);
final boolean hexViewerOnly = BytecodeViewer.viewer.viewPane1.getSelectedDecompiler() == Decompiler.HEXCODE_VIEWER &&
BytecodeViewer.viewer.viewPane2.getSelectedDecompiler() == Decompiler.NONE &&
BytecodeViewer.viewer.viewPane3.getSelectedDecompiler() == Decompiler.NONE;
//image viewer
if (!MiscUtils.isPureAscii(contentsAsString) || hexViewerOnly)
if (MiscUtils.guessIfBinary(contents) || hexViewerOnly)
{
//TODO:
// + Add file header checks
@ -144,7 +143,7 @@ public class FileViewer extends ResourceViewer
textArea.setCodeFoldingEnabled(true);
SyntaxLanguage.setLanguage(textArea, nameLowerCase);
textArea.setText(contentsAsString);
textArea.setText(new String(contents));
textArea.setFont(new Font(Font.MONOSPACED, Font.PLAIN, (int) BytecodeViewer.viewer.fontSpinner.getValue()));
textArea.setCaretPosition(0);

View file

@ -54,7 +54,6 @@ import static the.bytecode.club.bytecodeviewer.BytecodeViewer.gson;
public class MiscUtils
{
private static final CharsetEncoder asciiEncoder = StandardCharsets.US_ASCII.newEncoder(); // or "ISO-8859-1" for ISO Latin 1
private static final String AB = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
private static final String AN = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
private static final Random rnd = new Random();
@ -248,8 +247,18 @@ public class MiscUtils
list.remove(list.size() - 1);
}
public static boolean isPureAscii(String v) {
return asciiEncoder.canEncode(v);
/**
 * Heuristically decides whether a byte array holds binary rather than textual content.
 * <p>
 * Tab, line feed, form feed, carriage return and the printable ASCII range
 * (0x20 to 0x7E) are counted as text; every other byte value is counted as non-text.
 * The data is reported as binary when strictly more than a quarter of its bytes are non-text.
 * Based on https://stackoverflow.com/a/13533390/5894824
 *
 * @param data the raw bytes to inspect
 * @return {@code true} if more than 25% of the bytes are non-text, {@code false} otherwise
 *         (an empty array yields {@code false})
 */
public static boolean guessIfBinary(byte[] data) {
    int nonText = 0;
    for (int i = 0; i < data.length; i++) {
        final byte value = data[i];
        // Java bytes are signed, so values 0x80..0xFF are negative and fail both checks.
        final boolean printable = value >= 0x20 && value <= 0x7E;
        final boolean whitespace = value == '\t' || value == '\n' || value == '\f' || value == '\r';
        if (!printable && !whitespace) {
            nonText++;
        }
    }
    // nonText / length > 1/4, expressed without division; long arithmetic avoids int overflow.
    // When nonText is zero (including the empty array) this is always false.
    return 4L * nonText > data.length;
}
public static Language guessLanguage()