assafvayner's picture
assafvayner HF Staff
updates
c9619ed
// Binary parsers for xorb and shard files
import type {
ParsedFileMetadata,
Chunk,
ChunkHeader,
ShardData,
MerkleHash,
MDBShardFileHeader,
MDBShardFileFooter,
FileDataSequenceHeader,
FileDataSequenceEntry,
FileVerificationEntry,
FileMetadataExt,
CASChunkSequenceHeader,
CASChunkSequenceEntry,
MDBFileInfo,
MDBCASInfo,
} from "./types.js";
import { MDB_SHARD_HEADER_TAG, XORB_IDENT } from "./types.js";
/**
 * Sequential reader over a byte buffer with little-endian integer helpers.
 *
 * The reader keeps a cursor (`position`) that every `read*` call advances.
 * All reads are bounds-checked and throw `Error("Unexpected end of data")`
 * when fewer bytes remain than requested.
 */
export class BinaryReader {
  private data: Uint8Array;
  private view: DataView;
  private offset: number = 0;

  /**
   * @param data Bytes to read from. May be a view into a larger buffer
   *   (e.g. the result of `subarray`); offsets are relative to the view.
   */
  constructor(data: Uint8Array) {
    this.data = data;
    // Pass byteOffset/byteLength explicitly: a DataView built from
    // `data.buffer` alone would start at the beginning of the backing
    // buffer and read the wrong bytes when `data` is a subarray.
    this.view = new DataView(data.buffer, data.byteOffset, data.byteLength);
  }

  /**
   * Verifies that `length` bytes can be read at the current cursor.
   *
   * @throws Error if `length` is negative or not an integer, or if fewer
   *   than `length` bytes remain.
   */
  private ensureAvailable(length: number): void {
    if (!Number.isInteger(length) || length < 0) {
      throw new Error(`Invalid read length: ${length}`);
    }
    if (this.offset + length > this.data.length) {
      throw new Error("Unexpected end of data");
    }
  }

  /** Reads one unsigned byte and advances the cursor by 1. */
  readUint8(): number {
    this.ensureAvailable(1);
    return this.data[this.offset++];
  }

  /** Reads an unsigned 32-bit little-endian integer and advances by 4. */
  readUint32LE(): number {
    this.ensureAvailable(4);
    const result = this.view.getUint32(this.offset, true);
    this.offset += 4;
    return result;
  }

  /** Reads an unsigned 64-bit little-endian integer and advances by 8. */
  readUint64LE(): bigint {
    this.ensureAvailable(8);
    const result = this.view.getBigUint64(this.offset, true);
    this.offset += 8;
    return result;
  }

  /**
   * Reads `length` bytes and advances the cursor past them.
   *
   * @returns A copy of the bytes (not a view into the underlying data).
   * @throws Error if `length` is invalid or exceeds the remaining bytes.
   */
  readBytes(length: number): Uint8Array {
    this.ensureAvailable(length);
    const result = this.data.slice(this.offset, this.offset + length);
    this.offset += length;
    return result;
  }

  /** Reads a 32-byte hash as four consecutive little-endian 64-bit words. */
  readHash(): MerkleHash {
    const u64_0 = this.readUint64LE();
    const u64_1 = this.readUint64LE();
    const u64_2 = this.readUint64LE();
    const u64_3 = this.readUint64LE();
    return { data: [u64_0, u64_1, u64_2, u64_3] };
  }

  /** Reads `length` bytes and decodes them with `TextDecoder`'s default encoding. */
  readString(length: number): string {
    const bytes = this.readBytes(length);
    return new TextDecoder().decode(bytes);
  }

  /**
   * Moves the cursor to an absolute position.
   *
   * @param position Offset from the start of the data; the end of the data
   *   (`position === length`) is allowed.
   * @throws Error if `position` is not an integer within `[0, length]`.
   *   Without this check a negative cursor would make `readUint8` return
   *   `undefined` instead of failing.
   */
  seek(position: number): void {
    if (
      !Number.isInteger(position) ||
      position < 0 ||
      position > this.data.length
    ) {
      throw new Error(
        `Seek position ${position} is outside the data (length ${this.data.length})`
      );
    }
    this.offset = position;
  }

  /**
   * Moves the cursor to `offsetFromEnd` bytes before the end of the data.
   *
   * @throws Error if the resulting position falls outside the data.
   */
  seekFromEnd(offsetFromEnd: number): void {
    this.seek(this.data.length - offsetFromEnd);
  }

  /** Current cursor position, in bytes from the start of the data. */
  get position(): number {
    return this.offset;
  }

  /** Number of bytes between the cursor and the end of the data. */
  get remaining(): number {
    return this.data.length - this.offset;
  }
}
/**
 * Byte-wise equality of two arrays.
 *
 * @returns `true` when both arrays have the same length and hold the same
 *   value at every index.
 */
function arraysEqual(a: Uint8Array, b: Uint8Array): boolean {
  return a.length === b.length && a.every((byte, index) => byte === b[index]);
}
/**
 * Tells whether a hash is the bookend marker, i.e. every one of its 64-bit
 * words has all bits set (the hash is all 0xFF bytes).
 */
function isBookendHash(hash: MerkleHash): boolean {
  const allOnes = 0xffffffffffffffffn;
  return !hash.data.some((word) => word !== allOnes);
}
/**
 * Renders a hash as a lowercase hex string.
 *
 * Each 64-bit word is written as its 8 bytes in little-endian order (least
 * significant byte first), two hex digits per byte, and the words are
 * concatenated in array order.
 */
export function formatHash(hash: MerkleHash): string {
  let hex = "";
  for (const word of hash.data) {
    for (let byteIndex = 0; byteIndex < 8; byteIndex++) {
      // Shift the wanted byte down to the low 8 bits, then mask it out.
      const byte = Number((word >> BigInt(8 * byteIndex)) & 0xffn);
      hex += byte.toString(16).padStart(2, "0");
    }
  }
  return hex;
}
// File type detection removed - type is now specified by user selection
/**
 * Splits the bytes of a xorb file into its chunks.
 *
 * Each chunk starts with an 8-byte header:
 *   byte 0     version
 *   bytes 1-3  compressed size (24-bit little-endian)
 *   byte 4     compression type
 *   bytes 5-7  uncompressed size (24-bit little-endian)
 * followed by `compressed size` bytes of payload.
 *
 * Parsing stops at the end of the data, when fewer than 8 bytes remain
 * (logged via `console.error`), or when an 8-byte group begins with the
 * first 7 bytes of `XORB_IDENT`, which marks the optional xorb footer.
 *
 * @throws Error if a chunk's payload extends past the end of the data.
 */
function parseXorbFile(data: Uint8Array): Chunk[] {
  const headerSize = 8;
  const identPrefixLength = 7;

  // Reads a 24-bit little-endian unsigned integer from three header bytes.
  const readUint24LE = (view: DataView, at: number): number =>
    view.getUint8(at) |
    (view.getUint8(at + 1) << 8) |
    (view.getUint8(at + 2) << 16);

  const reader = new BinaryReader(data);
  const chunks: Chunk[] = [];

  while (reader.remaining > 0) {
    // A trailing fragment shorter than a header cannot be a chunk.
    if (reader.remaining < headerSize) {
      console.error("Unexpected end of data parsing xorb file");
      break;
    }

    const headerBytes = reader.readBytes(headerSize);

    // The optional footer is recognized by its identifier prefix; nothing
    // after it is chunk data, so stop here.
    const startsWithIdent = headerBytes
      .subarray(0, identPrefixLength)
      .every((byte, index) => byte === XORB_IDENT[index]);
    if (startsWithIdent) {
      break;
    }

    const headerView = new DataView(headerBytes.buffer);
    const chunkHeader: ChunkHeader = {
      version: headerView.getUint8(0),
      compressed_size: readUint24LE(headerView, 1),
      compression_type: headerView.getUint8(4),
      uncompressed_size: readUint24LE(headerView, 5),
    };

    chunks.push({
      header: chunkHeader,
      compressed_data: reader.readBytes(chunkHeader.compressed_size),
    });
  }

  return chunks;
}
/**
 * Reads the shard footer fields starting at the reader's current position.
 *
 * Layout, in read order: version, file info offset, CAS info offset (u64
 * each), a 48-byte buffer that is skipped, the chunk hash HMAC key (32-byte
 * hash), creation timestamp, key expiry (u64 each), a 72-byte buffer that is
 * skipped, and the footer offset (u64). 64-bit values are converted to
 * `number`.
 */
function readShardFooter(reader: BinaryReader): MDBShardFileFooter {
  const version = Number(reader.readUint64LE());
  const file_info_offset = Number(reader.readUint64LE());
  const cas_info_offset = Number(reader.readUint64LE());
  reader.readBytes(48); // first skipped buffer
  const chunk_hash_hmac_key = reader.readHash();
  const shard_creation_timestamp = Number(reader.readUint64LE());
  const shard_key_expiry = Number(reader.readUint64LE());
  reader.readBytes(72); // second skipped buffer
  const footer_offset = Number(reader.readUint64LE());
  return {
    version,
    file_info_offset,
    cas_info_offset,
    chunk_hash_hmac_key,
    shard_creation_timestamp,
    shard_key_expiry,
    footer_offset,
  };
}

/**
 * Reads file info records from `startOffset` until a bookend hash is found
 * or the cursor reaches `endOffset`.
 */
function readShardFileInfo(
  reader: BinaryReader,
  startOffset: number,
  endOffset: number
): MDBFileInfo[] {
  // Bits of `file_flags` that announce optional trailing sections.
  const hasVerificationFlag = 0x80000000;
  const hasMetadataExtFlag = 0x40000000;

  const file_info: MDBFileInfo[] = [];
  reader.seek(startOffset);
  while (reader.position < endOffset) {
    const file_hash = reader.readHash();
    if (isBookendHash(file_hash)) {
      reader.readBytes(16); // unused remainder of the bookend record
      break;
    }

    const file_flags = reader.readUint32LE();
    const num_entries = reader.readUint32LE();
    const _unused = reader.readBytes(8);
    const sequenceHeader: FileDataSequenceHeader = {
      file_hash,
      file_flags,
      num_entries,
      _unused,
    };

    const entries: FileDataSequenceEntry[] = [];
    for (let i = 0; i < num_entries; i++) {
      const cas_hash = reader.readHash();
      const cas_flags = reader.readUint32LE();
      const unpacked_segment_bytes = reader.readUint32LE();
      const chunk_index_start = reader.readUint32LE();
      const chunk_index_end = reader.readUint32LE();
      entries.push({
        cas_hash,
        cas_flags,
        unpacked_segment_bytes,
        chunk_index_start,
        chunk_index_end,
      });
    }

    // One verification entry per data entry, only when the flag is set.
    let verification_entries: FileVerificationEntry[] | undefined;
    if ((file_flags & hasVerificationFlag) !== 0) {
      verification_entries = [];
      for (let i = 0; i < num_entries; i++) {
        const chunk_hash = reader.readHash();
        const unusedBytes = reader.readBytes(16);
        verification_entries.push({ chunk_hash, _unused: unusedBytes });
      }
    }

    // A single metadata extension record, only when the flag is set.
    let metadata_ext: FileMetadataExt | undefined;
    if ((file_flags & hasMetadataExtFlag) !== 0) {
      const sha256 = reader.readHash();
      const unusedBytes = reader.readBytes(16);
      metadata_ext = { sha256, _unused: unusedBytes };
    }

    file_info.push({
      header: sequenceHeader,
      entries,
      verification_entries,
      metadata_ext,
    });
  }
  return file_info;
}

/**
 * Reads CAS info records from `startOffset` until a bookend hash is found
 * or the cursor reaches `endOffset`.
 */
function readShardCasInfo(
  reader: BinaryReader,
  startOffset: number,
  endOffset: number
): MDBCASInfo[] {
  const cas_info: MDBCASInfo[] = [];
  reader.seek(startOffset);
  while (reader.position < endOffset) {
    const cas_hash = reader.readHash();
    if (isBookendHash(cas_hash)) {
      break;
    }

    const cas_flags = reader.readUint32LE();
    const num_entries = reader.readUint32LE();
    const num_bytes_in_cas = reader.readUint32LE();
    const num_bytes_on_disk = reader.readUint32LE();
    const sequenceHeader: CASChunkSequenceHeader = {
      cas_hash,
      cas_flags,
      num_entries,
      num_bytes_in_cas,
      num_bytes_on_disk,
    };

    const entries: CASChunkSequenceEntry[] = [];
    for (let i = 0; i < num_entries; i++) {
      const chunk_hash = reader.readHash();
      const chunk_byte_range_start = reader.readUint32LE();
      const unpacked_segment_bytes = reader.readUint32LE();
      const _unused = Number(reader.readUint64LE());
      entries.push({
        chunk_hash,
        chunk_byte_range_start,
        unpacked_segment_bytes,
        _unused,
      });
    }

    cas_info.push({ header: sequenceHeader, entries });
  }
  return cas_info;
}

/**
 * Parses a shard file: header, footer, file info section and CAS info
 * section.
 *
 * The header is a 32-byte tag that must equal `MDB_SHARD_HEADER_TAG`,
 * followed by the version and the footer size (u64 each). The footer is
 * located `footer_size` bytes before the end of the data, and its offsets
 * delimit the two info sections.
 *
 * @throws Error if the tag does not match, if the header version is not 2,
 *   if the footer version is not 1, or if the data ends before a read
 *   completes.
 */
function parseShardFile(data: Uint8Array): ShardData {
  const reader = new BinaryReader(data);

  // Parse header
  const tag = reader.readBytes(32);
  if (!arraysEqual(tag, MDB_SHARD_HEADER_TAG)) {
    throw new Error("Invalid shard file header tag");
  }
  const header: MDBShardFileHeader = {
    tag,
    version: Number(reader.readUint64LE()),
    footer_size: Number(reader.readUint64LE()),
  };
  if (header.version !== 2) {
    throw new Error(`Unsupported shard header version: ${header.version}`);
  }

  // Parse footer (from end of file). The whole footer is read before its
  // version is validated.
  reader.seekFromEnd(header.footer_size);
  const footer = readShardFooter(reader);
  if (footer.version !== 1) {
    throw new Error(`Unsupported shard footer version: ${footer.version}`);
  }

  // The file info section ends where the CAS info section starts, and the
  // CAS info section ends where the footer starts.
  const file_info = readShardFileInfo(
    reader,
    footer.file_info_offset,
    footer.cas_info_offset
  );
  const cas_info = readShardCasInfo(
    reader,
    footer.cas_info_offset,
    footer.footer_offset
  );

  return {
    header,
    footer,
    file_info,
    cas_info,
  };
}
/**
 * Reads a user-selected file into memory and parses it as the given type.
 *
 * This function does not throw on parse failure: any error raised while
 * reading or parsing is reported through the `error` field of the result,
 * with `data` set to an empty array.
 *
 * @param file The file to read.
 * @param fileType Which parser to use; the type is chosen by the caller,
 *   not detected from the content.
 * @returns The file's name, size and type together with either the parsed
 *   data or an error message.
 */
export async function parseFile(
  file: File,
  fileType: "xorb" | "shard"
): Promise<ParsedFileMetadata> {
  // Fields shared by the success and failure results.
  const summary = {
    type: fileType,
    filename: file.name,
    fileSize: file.size,
  };

  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const parsedData: Chunk[] | ShardData =
      fileType === "xorb" ? parseXorbFile(bytes) : parseShardFile(bytes);
    return { ...summary, data: parsedData };
  } catch (error) {
    const message =
      error instanceof Error ? error.message : "Unknown error occurred";
    // The empty array is a placeholder for both file types, hence the cast.
    return { ...summary, data: [] as any, error: message };
  }
}
// Helper functions for displaying data
/**
 * Formats a byte count for display using decimal (power-of-1000) units.
 *
 * The value is scaled to the largest unit that keeps it at or above 1,
 * rounded to at most two decimals, and trailing zeros are dropped
 * (e.g. `1500` -> `"1.5 KB"`). Values beyond the largest unit stay in that
 * unit, values below 1 stay in bytes, and negative values keep their sign.
 *
 * @param bytes The size in bytes.
 * @returns The formatted size, e.g. `"0 B"`, `"999 B"`, `"1.23 MB"`.
 *   Non-finite input is rendered as-is with a `B` suffix.
 */
export function formatBytes(bytes: number): string {
  if (bytes === 0) return "0 B";
  if (!Number.isFinite(bytes)) return `${bytes} B`;

  const k = 1000;
  const sizes = ["B", "KB", "MB", "GB", "TB", "PB"];
  const magnitude = Math.abs(bytes);

  // Pick the unit by comparing against exact powers of k rather than using
  // a logarithm ratio: that avoids floating-point rounding at exact powers
  // and keeps the index inside the unit table.
  let unitIndex = 0;
  while (
    unitIndex < sizes.length - 1 &&
    magnitude >= Math.pow(k, unitIndex + 1)
  ) {
    unitIndex++;
  }

  const scaled = bytes / Math.pow(k, unitIndex);
  return parseFloat(scaled.toFixed(2)) + " " + sizes[unitIndex];
}
/**
 * Formats a timestamp given in seconds since the Unix epoch as an ISO 8601
 * string.
 *
 * @param timestamp Seconds since 1970-01-01T00:00:00Z.
 * @returns The ISO 8601 representation, or `"Invalid timestamp (<value>)"`
 *   when the value cannot be represented as a `Date` (NaN, or outside the
 *   supported date range). `Date.prototype.toISOString` throws a
 *   `RangeError` for such dates, which this display helper avoids.
 */
export function formatTimestamp(timestamp: number): string {
  const date = new Date(timestamp * 1000);
  if (Number.isNaN(date.getTime())) {
    return `Invalid timestamp (${timestamp})`;
  }
  return date.toISOString();
}
/**
 * Abbreviated form of a hash for display: the first 16 hex characters of
 * `formatHash(hash)` followed by `"..."`.
 */
export function formatHashShort(hash: MerkleHash): string {
  const prefixLength = 16;
  return `${formatHash(hash).slice(0, prefixLength)}...`;
}