mirror of
https://github.com/0x5eal/luau-unzip.git
synced 2025-04-04 06:30:53 +01:00
perf: improve EoCD offset detection algorithm
This commit is contained in:
parent
a677432c59
commit
b710e2963c
1 changed files with 117 additions and 32 deletions
149
lib/init.luau
149
lib/init.luau
|
@ -422,42 +422,99 @@ function ZipReader.new(data): ZipReader
|
||||||
end
|
end
|
||||||
|
|
||||||
--[=[
|
--[=[
|
||||||
@within ZipReader
|
@within ZipReader
|
||||||
@method parseCentralDirectory
|
@method findEocdPosition
|
||||||
@private
|
@private
|
||||||
|
|
||||||
Parses the central directory of the ZIP file and populates the `entries` and `directories`
|
Finds the position of the End of Central Directory (EoCD) signature in the ZIP file. This
|
||||||
fields. Used internally during initialization of the [ZipReader].
|
implementation is inspired by that of [async_zip], a Rust library for parsing ZIP files
|
||||||
|
asynchronously.
|
||||||
|
|
||||||
|
This method involves buffered reading in reverse and reverse linear searching along those buffers
|
||||||
|
for the EoCD signature. As a result of the buffered approach, we reduce individual reads when compared
|
||||||
|
to reading every single byte sequentially, by a factor of the buffer size (4 KB by default). The buffer
|
||||||
|
size of 4 KB was arrived at because it aligns with many systems' page sizes, and also provides a
|
||||||
|
good balance between read efficiency (not too small), memory usage (not too large) and CPU cache
|
||||||
|
performance.
|
||||||
|
|
||||||
|
From my primitive benchmarks, this method is ~1.5x faster than the sequential approach.
|
||||||
|
|
||||||
|
**Errors if the ZIP file is invalid.**
|
||||||
|
|
||||||
|
[async_zip]: https://github.com/Majored/rs-async-zip/blob/527bda9/src/base/read/io/locator.rs#L37-L45
|
||||||
|
|
||||||
**Errors if the ZIP file is invalid.**
|
|
||||||
|
|
||||||
@error "Could not find End of Central Directory signature"
|
@error "Could not find End of Central Directory signature"
|
||||||
@error "Invalid Central Directory offset or size"
|
|
||||||
@error "Invalid Central Directory entry signature"
|
@return number -- The offset to the End of Central Directory (including the signature)
|
||||||
@error "Found different entries than specified in Central Directory"
|
|
||||||
]=]
|
]=]
|
||||||
function ZipReader.parseCentralDirectory(self: ZipReader): ()
|
function ZipReader.findEocdPosition(self: ZipReader): number
	-- Locates the End of Central Directory (EoCD) signature by scanning the tail of
	-- `self.data` in fixed-size windows, starting with the window closest to the end
	-- of the data and moving towards the start. The match nearest to the end of the
	-- data wins, so signature-like bytes appearing earlier in the searched region
	-- (for example inside a stored nested archive) are not mistaken for the EoCD.
	--
	-- Returns the offset of the signature; errors with
	-- "Could not find End of Central Directory signature" when there is none.

	-- Size of the scratch window copied out of `self.data` for each search pass
	local BUFFER_SIZE = 4096
	local SIGNATURE_LENGTH = 4
	-- Fixed-size portion of the EoCD record; the variable-length comment follows it
	local EOCD_MIN_SIZE = 22
	-- The comment length is stored in a 16-bit field, so a comment is at most 65535 bytes
	local MAX_COMMENT_SIZE = 65535

	local bufSize = buffer.len(self.data)
	if bufSize < EOCD_MIN_SIZE then
		-- Too small to hold even a comment-less EoCD record
		error("Could not find End of Central Directory signature")
	end

	-- Earliest offset at which the EoCD can begin (record followed by a maximum-size comment)
	local minPos = math.max(0, bufSize - (EOCD_MIN_SIZE + MAX_COMMENT_SIZE))
	-- Exclusive end of the bytes worth inspecting: the signature cannot start later than
	-- `bufSize - EOCD_MIN_SIZE`, otherwise the fixed-size record would not fit in the data
	local windowEnd = bufSize - EOCD_MIN_SIZE + SIGNATURE_LENGTH
	local searchBuf = buffer.create(BUFFER_SIZE)

	while true do
		local windowStart = math.max(minPos, windowEnd - BUFFER_SIZE)
		local readSize = windowEnd - windowStart
		buffer.copy(searchBuf, 0, self.data, windowStart, readSize)

		-- Search backwards through the window so the match closest to the end is returned
		for i = readSize - SIGNATURE_LENGTH, 0, -1 do
			if buffer.readu32(searchBuf, i) == SIGNATURES.END_OF_CENTRAL_DIR then
				return windowStart + i
			end
		end

		if windowStart <= minPos then
			-- The whole candidate region has been searched
			break
		end

		-- Move the window backward, keeping the first SIGNATURE_LENGTH - 1 bytes of the
		-- current window so a signature straddling the boundary is still seen
		windowEnd = windowStart + SIGNATURE_LENGTH - 1
	end

	error("Could not find End of Central Directory signature")
end
|
||||||
error("Could not find End of Central Directory signature")
|
|
||||||
end
|
|
||||||
|
|
||||||
|
--[=[
|
||||||
|
@within ZipReader
|
||||||
|
@interface EocdRecord
|
||||||
|
@private
|
||||||
|
|
||||||
|
A parsed End of Central Directory record.
|
||||||
|
|
||||||
|
@field diskNumber number -- The disk number
|
||||||
|
@field diskWithCD number -- The disk number of the disk with the Central Directory
|
||||||
|
@field cdEntries number -- The total number of entries in the Central Directory (EoCD offset 10)
|
||||||
|
@field totalCDEntries number -- The number of Central Directory entries on this disk (EoCD offset 8)
|
||||||
|
@field cdSize number -- The size of the Central Directory
|
||||||
|
@field cdOffset number -- The offset of the Central Directory
|
||||||
|
@field comment string -- The comment associated with the ZIP
|
||||||
|
]=]
|
||||||
|
export type EocdRecord = {
|
||||||
|
diskNumber: number,
|
||||||
|
diskWithCD: number,
|
||||||
|
cdEntries: number,
|
||||||
|
totalCDEntries: number,
|
||||||
|
cdSize: number,
|
||||||
|
cdOffset: number,
|
||||||
|
comment: string,
|
||||||
|
}
|
||||||
|
|
||||||
|
--[=[
|
||||||
|
@within ZipReader
|
||||||
|
@method parseEocdRecord
|
||||||
|
@private
|
||||||
|
|
||||||
|
Parses the End of Central Directory record at the given position, usually located
|
||||||
|
using [ZipReader:findEocdPosition].
|
||||||
|
|
||||||
|
**Errors if the ZIP file is invalid.**
|
||||||
|
|
||||||
|
@error "Invalid Central Directory offset or size"
|
||||||
|
|
||||||
|
@param pos number -- The offset to the End of Central Directory record
|
||||||
|
@return EocdRecord -- Structural representation of the parsed record
|
||||||
|
]=]
|
||||||
|
function ZipReader.parseEocdRecord(self: ZipReader, pos: number): EocdRecord
|
||||||
-- End of Central Directory format:
|
-- End of Central Directory format:
|
||||||
-- Offset Bytes Description
|
-- Offset Bytes Description
|
||||||
-- 0 4 End of central directory signature
|
-- 0 4 End of central directory signature
|
||||||
|
@ -470,19 +527,49 @@ function ZipReader.parseCentralDirectory(self: ZipReader): ()
|
||||||
-- 20 2 Comment length (n)
|
-- 20 2 Comment length (n)
|
||||||
-- 22 n Comment
|
-- 22 n Comment
|
||||||
|
|
||||||
local cdSize = buffer.readu32(self.data, pos + 12)
|
|
||||||
local cdEntries = buffer.readu16(self.data, pos + 10)
|
local cdEntries = buffer.readu16(self.data, pos + 10)
|
||||||
|
local cdSize = buffer.readu32(self.data, pos + 12)
|
||||||
local cdOffset = buffer.readu32(self.data, pos + 16)
|
local cdOffset = buffer.readu32(self.data, pos + 16)
|
||||||
|
|
||||||
-- Strict validation of CD boundaries and entry count
|
-- Validate CD boundaries and entry count
|
||||||
|
local bufSize = buffer.len(self.data)
|
||||||
if cdOffset >= bufSize or cdOffset + cdSize > bufSize then
|
if cdOffset >= bufSize or cdOffset + cdSize > bufSize then
|
||||||
error("Invalid Central Directory offset or size")
|
error("Invalid Central Directory offset or size")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
local commentLength = buffer.readu16(self.data, pos + 20)
|
||||||
|
return {
|
||||||
|
diskNumber = buffer.readu16(self.data, pos + 4),
|
||||||
|
diskWithCD = buffer.readu16(self.data, pos + 6),
|
||||||
|
cdEntries = cdEntries,
|
||||||
|
totalCDEntries = buffer.readu16(self.data, pos + 8),
|
||||||
|
cdSize = cdSize,
|
||||||
|
cdOffset = cdOffset,
|
||||||
|
comment = buffer.readstring(self.data, pos + 22, commentLength),
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
--[=[
|
||||||
|
@within ZipReader
|
||||||
|
@method parseCentralDirectory
|
||||||
|
@private
|
||||||
|
|
||||||
|
Parses the central directory of the ZIP file and populates the `entries` and `directories`
|
||||||
|
fields. Used internally during initialization of the [ZipReader].
|
||||||
|
|
||||||
|
**Errors if the ZIP file is invalid.**
|
||||||
|
|
||||||
|
@error "Invalid Central Directory entry signature"
|
||||||
|
@error "Found different entries than specified in Central Directory"
|
||||||
|
]=]
|
||||||
|
function ZipReader.parseCentralDirectory(self: ZipReader): ()
|
||||||
|
local eocdPos = self:findEocdPosition()
|
||||||
|
local record = self:parseEocdRecord(eocdPos)
|
||||||
|
|
||||||
-- Track actual entries found
|
-- Track actual entries found
|
||||||
local entriesFound = 0
|
local entriesFound = 0
|
||||||
pos = cdOffset
|
local pos = record.cdOffset
|
||||||
while pos < cdOffset + cdSize do
|
while pos < record.cdOffset + record.cdSize do
|
||||||
if buffer.readu32(self.data, pos) ~= SIGNATURES.CENTRAL_DIR then
|
if buffer.readu32(self.data, pos) ~= SIGNATURES.CENTRAL_DIR then
|
||||||
error("Invalid Central Directory entry signature")
|
error("Invalid Central Directory entry signature")
|
||||||
end
|
end
|
||||||
|
@ -540,12 +627,10 @@ function ZipReader.parseCentralDirectory(self: ZipReader): ()
|
||||||
entriesFound += 1
|
entriesFound += 1
|
||||||
end
|
end
|
||||||
|
|
||||||
if entriesFound ~= cdEntries then
|
if entriesFound ~= record.cdEntries then
|
||||||
error("Found different entries than specified in Central Directory")
|
error("Found different entries than specified in Central Directory")
|
||||||
end
|
end
|
||||||
|
self.comment = record.comment
|
||||||
local cdCommentLength = buffer.readu16(self.data, pos + 20)
|
|
||||||
self.comment = buffer.readstring(self.data, pos + 22, cdCommentLength)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
--[=[
|
--[=[
|
||||||
|
|
Loading…
Add table
Reference in a new issue