coreboot/src/lib/lzmadecode.c
Patrick Rudolph 159afbc5d5 lib/lzmadecode: Increase decoding speed by 30%
When CONFIG_SSE is enabled, use the "prefetchnta" instruction to load
the next chunk of data into the CPU cache. This only works when the
input stream is covered by an MTRR. When the input stream is read
from the SPI ROM MMIO area, this keeps the SPI controller busy
fetching new data, which is automatically placed into the CPU
cache, resulting in less I/O wait on the CPU side and faster
decompression.

When the input stream is not cacheable, the prefetch instruction has no
effect.

The SPI interface on the tested device runs at 100Mbit/s and the
Sandy Bridge mobile CPU has quite some work to do decompressing the
LZMA stream.
That gives the SPI controller enough time to preload data into the
cache.

The payload of 1100213 bytes is now read in 164msec, resulting in an
input bandwidth of 53MBit/s.

TEST=Booted on Lenovo X220 and used cbmem -t:
Before:
  16:finished LZMA decompress (ignore for x86)   1,218,418 (210,054)
After:
  16:finished LZMA decompress (ignore for x86)   1,170,949 (164,868)

Boots 46msec faster than before or 30% faster than before.

Change-Id: I3b2ed7fe0883f271553ecd1ab4191e4848ad0299
Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/88813
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Angel Pons <th3fanbus@gmail.com>
2025-09-09 14:40:45 +00:00

457 lines
11 KiB
C

/*
LzmaDecode.c
LZMA Decoder (optimized for Speed version)
LZMA SDK 4.40 Copyright (c) 1999-2006 Igor Pavlov (2006-05-01)
http://www.7-zip.org/
LZMA SDK is licensed under two licenses:
1) GNU Lesser General Public License (GNU LGPL)
2) Common Public License (CPL)
It means that you can select one of these two licenses and
follow rules of that license.
SPECIAL EXCEPTION:
Igor Pavlov, as the author of this Code, expressly permits you to
statically or dynamically link your Code (or bind by name) to the
interfaces of this file without subjecting your linked Code to the
terms of the CPL or GNU LGPL. Any modifications or additions
to this file, however, are subject to the LGPL or CPL terms.
*/
/*
 * Function tag used to request aggressive optimization for the decoder.
 * With CONFIG(DECOMPRESS_OFAST) enabled it expands to the compiler's
 * optimize("Ofast") function attribute; otherwise it expands to nothing and
 * the tagged function is built with the file's regular flags.
 */
#if CONFIG(DECOMPRESS_OFAST)
#define __lzma_attribute_Ofast__ __attribute__((optimize("Ofast")))
#else
#define __lzma_attribute_Ofast__
#endif
/*
 * __lzma_prefetch(x) issues a "prefetchnta" hint for the memory location x
 * (x is passed to the asm statement as a memory operand, so it must be an
 * lvalue). Without CONFIG(SSE) the macro expands to nothing.
 *
 * When the input stream is covered by an MTRR the "prefetch" instruction
 * will load the next chunk of data into the CPU cache ahead of time.
 * On a 100MBit/s SPI interface this reduces the time spent in I/O wait
 * by 5usec for every cache-line (64bytes) prefetched.
 */
#if CONFIG(SSE)
#define __lzma_prefetch(x) {asm volatile("prefetchnta %0" :: "m" (x));}
#else
#define __lzma_prefetch(x)
#endif
#include "lzmadecode.h"
#include <types.h>
/*
 * Range coder tuning constants.
 *
 * kTopValue (1 << kNumTopBits) is the threshold below which RC_NORMALIZE
 * pulls another input byte into the coder.
 */
#define kNumTopBits 24
#define kTopValue ((UInt32)1 << kNumTopBits)
/*
 * Probabilities are scaled to kBitModelTotal (1 << kNumBitModelTotalBits);
 * LzmaDecode() starts every probability at half of that value.
 */
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
/* Shift applied by UpdateBit0/UpdateBit1 when adapting a probability. */
#define kNumMoveBits 5
/*
 * RC_READ_BYTE evaluates to the next byte of the input stream.
 *
 * Use sizeof(SizeT) sized reads whenever possible to avoid bad flash
 * performance. Fall back to byte reads for the last sizeof(SizeT) bytes since
 * RC_TEST returns an error when BufferLim is *reached* (not surpassed!),
 * meaning we can't allow that to happen while there are still bytes to decode
 * from the algorithm's point of view.
 *
 * The expression picks one of three sources, in this order:
 *  1. If look_ahead still holds unread bytes (look_ahead_ptr is below
 *     sizeof(SizeT)), the next of those bytes is returned.
 *  2. Otherwise, if Buffer is not SizeT-aligned or at most sizeof(SizeT)
 *     bytes remain before BufferLim, a single byte is read from Buffer.
 *  3. Otherwise a whole SizeT word is loaded into look_ahead, Buffer is
 *     advanced past it, the word's first byte is returned and
 *     look_ahead_ptr is set to 1 so the remaining bytes are served by case 1.
 *
 * Note that in case 3 Buffer runs ahead of the bytes actually handed out.
 *
 * The macro relies on Buffer, BufferLim, look_ahead and look_ahead_ptr being
 * declared in the scope where it is expanded.
 */
#define RC_READ_BYTE \
(look_ahead_ptr < sizeof(SizeT) ? look_ahead.raw[look_ahead_ptr++] \
: ((((uintptr_t) Buffer & (sizeof(SizeT) - 1)) \
|| ((SizeT) (BufferLim - Buffer) <= sizeof(SizeT))) ? (*Buffer++) \
: ((look_ahead.dw = *(SizeT *)Buffer), (Buffer += sizeof(SizeT)), \
(look_ahead_ptr = 1), look_ahead.raw[0])))
/*
 * RC_INIT2 resets the range coder: Code is cleared, Range is set to
 * 0xFFFFFFFF and five input bytes are shifted into Code one after another.
 * RC_TEST runs before every read, so the enclosing function returns
 * LZMA_RESULT_DATA_ERROR if the input ends during initialization.
 */
#define RC_INIT2 Code = 0; Range = 0xFFFFFFFF; \
{ \
int i; \
\
for (i = 0; i < 5; i++) { \
RC_TEST; \
Code = (Code << 8) | RC_READ_BYTE; \
} \
}
/*
 * RC_TEST makes the *enclosing function* return LZMA_RESULT_DATA_ERROR once
 * Buffer has reached BufferLim. It contains a return statement, so it may
 * only be used inside functions returning an LZMA result code.
 */
#define RC_TEST { if (Buffer == BufferLim) return LZMA_RESULT_DATA_ERROR; }
/*
 * RC_INIT points Buffer at the start of the input and BufferLim one past its
 * end, then runs RC_INIT2. The arguments are pasted unparenthesized, so pass
 * plain identifiers or already parenthesized expressions.
 */
#define RC_INIT(buffer, bufferSize) Buffer = buffer; \
BufferLim = buffer + bufferSize; RC_INIT2
/*
 * RC_NORMALIZE refills the range coder when Range has dropped below
 * kTopValue: Range is shifted left by 8 bits and one more input byte is
 * shifted into Code. RC_TEST runs first, so the enclosing function returns
 * LZMA_RESULT_DATA_ERROR if no input is left.
 *
 * After the read, if Buffer sits on a 64-byte boundary and at least 128
 * bytes remain before BufferLim, a prefetch hint is issued for Buffer[64].
 * __lzma_prefetch expands to nothing unless CONFIG(SSE) is enabled.
 *
 * The macro expands to a bare if statement; the caller supplies (or omits)
 * the trailing semicolon.
 */
#define RC_NORMALIZE \
if (Range < kTopValue) { \
RC_TEST; \
Range <<= 8; \
Code = (Code << 8) | RC_READ_BYTE; \
if (!((uintptr_t)Buffer & 63)) { \
if ((BufferLim - Buffer) >= 128) { \
__lzma_prefetch(Buffer[64]); \
} \
} \
}
/*
 * IfBit0(p) normalizes the range coder, computes the split point
 * bound = (Range >> kNumBitModelTotalBits) * *p and then opens
 * "if (Code < bound)". The caller writes the body for a decoded 0 bit and,
 * optionally, an else branch for a decoded 1 bit. A variable named bound
 * must be declared in the expanding scope.
 */
#define IfBit0(p) \
RC_NORMALIZE; \
bound = (Range >> kNumBitModelTotalBits) * *(p); \
if (Code < bound)
/*
 * UpdateBit0(p) commits a decoded 0 bit: Range shrinks to bound and the
 * probability *p moves towards kBitModelTotal by 1/(1 << kNumMoveBits) of
 * the remaining distance.
 */
#define UpdateBit0(p) \
Range = bound; \
*(p) += (kBitModelTotal - *(p)) >> kNumMoveBits
/*
 * UpdateBit1(p) commits a decoded 1 bit: bound is removed from both Range
 * and Code and the probability *p is reduced by *p >> kNumMoveBits.
 */
#define UpdateBit1(p) \
Range -= bound; \
Code -= bound; \
*(p) -= (*(p)) >> kNumMoveBits
/*
 * RC_GET_BIT2(p, mi, A0, A1) decodes one bit with probability *p and appends
 * it to mi (mi = 2 * mi + bit). A0 is pasted as a statement after a 0 bit,
 * A1 after a 1 bit; both may contain control flow such as break, which then
 * acts on the loop surrounding the macro invocation.
 */
#define RC_GET_BIT2(p, mi, A0, A1) \
IfBit0(p) { \
UpdateBit0(p); \
mi <<= 1; \
A0; \
} else { \
UpdateBit1(p); \
mi = (mi + mi) + 1; \
A1; \
}
/* RC_GET_BIT(p, mi) is RC_GET_BIT2 without any extra per-bit statements. */
#define RC_GET_BIT(p, mi) RC_GET_BIT2(p, mi, ;, ;)
/*
 * RangeDecoderBitTreeDecode(probs, numLevels, res) decodes a numLevels-bit
 * value through a binary tree of probabilities rooted at probs[1].
 *
 * res starts at 1 and doubles as the tree index: every decoded bit is
 * appended to it and selects the next node. After numLevels bits the leading
 * 1 is removed by subtracting (1 << numLevels), leaving the decoded value in
 * res.
 *
 * numLevels must be at least 1 because the loop body always runs once, and
 * it is evaluated twice. res must be an lvalue.
 */
#define RangeDecoderBitTreeDecode(probs, numLevels, res) \
{ \
int i = numLevels; \
\
res = 1; \
do { \
CProb *cp = probs + res; \
RC_GET_BIT(cp, res) \
} while (--i != 0); \
res -= (1 << numLevels); \
}
/* Width of the position-state field used to index per-position tables. */
#define kNumPosBitsMax 4
#define kNumPosStatesMax (1 << kNumPosBitsMax)
/* Bit widths and symbol counts of the low, mid and high length trees. */
#define kLenNumLowBits 3
#define kLenNumLowSymbols (1 << kLenNumLowBits)
#define kLenNumMidBits 3
#define kLenNumMidSymbols (1 << kLenNumMidBits)
#define kLenNumHighBits 8
#define kLenNumHighSymbols (1 << kLenNumHighBits)
/*
 * Offsets inside one length coder's probability table: two choice entries,
 * then the low and mid trees (one tree per position state each) and a single
 * high tree. kNumLenProbs is the size of the whole table.
 */
#define LenChoice 0
#define LenChoice2 (LenChoice + 1)
#define LenLow (LenChoice2 + 1)
#define LenMid (LenLow + (kNumPosStatesMax << kLenNumLowBits))
#define LenHigh (LenMid + (kNumPosStatesMax << kLenNumMidBits))
#define kNumLenProbs (LenHigh + kLenNumHighSymbols)
/*
 * Decoder state machine size. LzmaDecode() keeps state below kNumLitStates
 * after decoding a literal and at or above it after decoding a match.
 */
#define kNumStates 12
#define kNumLitStates 7
/* Parameters of the match distance (position slot) decoding. */
#define kStartPosModelIndex 4
#define kEndPosModelIndex 14
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1))
#define kNumPosSlotBits 6
#define kNumLenToPosStates 4
#define kNumAlignBits 4
#define kAlignTableSize (1 << kNumAlignBits)
/* Added to every decoded match length. */
#define kMatchMinLen 2
/*
 * Offsets of the individual probability tables inside the CProb array used
 * by LzmaDecode(). Each table starts where the previous one ends; the
 * literal coder's tables begin at Literal.
 */
#define IsMatch 0
#define IsRep (IsMatch + (kNumStates << kNumPosBitsMax))
#define IsRepG0 (IsRep + kNumStates)
#define IsRepG1 (IsRepG0 + kNumStates)
#define IsRepG2 (IsRepG1 + kNumStates)
#define IsRep0Long (IsRepG2 + kNumStates)
#define PosSlot (IsRep0Long + (kNumStates << kNumPosBitsMax))
#define SpecPos (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
#define Align (SpecPos + kNumFullDistances - kEndPosModelIndex)
#define LenCoder (Align + kAlignTableSize)
#define RepLenCoder (LenCoder + kNumLenProbs)
#define Literal (RepLenCoder + kNumLenProbs)
/*
 * Build-time guard: the layout above must add up to LZMA_BASE_SIZE. If it
 * does not, the bare identifier below is left in the translation unit and
 * breaks the build.
 */
#if Literal != LZMA_BASE_SIZE
StopCompilingDueBUG
#endif
/*
 * Unpack the LZMA properties byte into its lc, lp and pb fields.
 *
 * The first byte of propsData packs the three values as
 * (pb * 5 + lp) * 9 + lc, so valid bytes are below 9 * 5 * 5.
 *
 * propsRes:  receives lc, lp and pb; left untouched on error.
 * propsData: raw properties header; only the first byte is read.
 * size:      number of bytes available at propsData.
 *
 * Returns LZMA_RESULT_OK on success, or LZMA_RESULT_DATA_ERROR when fewer
 * than LZMA_PROPERTIES_SIZE bytes are available or the byte is out of range.
 */
int LzmaDecodeProperties(CLzmaProperties *propsRes,
	const unsigned char *propsData, int size)
{
	unsigned int packed;
	unsigned int lpAndPb;

	if (size < LZMA_PROPERTIES_SIZE)
		return LZMA_RESULT_DATA_ERROR;

	packed = propsData[0];
	if (packed >= (9 * 5 * 5))
		return LZMA_RESULT_DATA_ERROR;

	/* Peel off lc (base 9), then split the rest into lp and pb (base 5). */
	lpAndPb = packed / 9;
	propsRes->lc = packed % 9;
	propsRes->lp = lpAndPb % 5;
	propsRes->pb = lpAndPb / 5;

	return LZMA_RESULT_OK;
}
/* Value stored in len when the end-of-stream marker has been decoded. */
#define kLzmaStreamWasFinishedId (-1)

/*
 * Decode an LZMA stream from inStream into outStream in a single call.
 *
 * vs:               decoder state; vs->Properties supplies lc/lp/pb and
 *                   vs->Probs must have room for
 *                   Literal + (LZMA_LIT_SIZE << (lc + lp)) entries, all of
 *                   which are reinitialized here.
 * inStream/inSize:  compressed input, starting with the range coder bytes.
 * inSizeProcessed:  set to 0 on entry; on success receives the number of
 *                   input bytes the range coder actually consumed.
 * outStream/outSize: output buffer; decoding stops once outSize bytes have
 *                   been produced or the end-of-stream marker is decoded.
 * outSizeProcessed: set to 0 on entry; on success receives the number of
 *                   bytes written to outStream.
 *
 * Returns LZMA_RESULT_OK on success. Returns LZMA_RESULT_DATA_ERROR when the
 * input runs out while more bytes are needed (RC_TEST) or when a match
 * refers to data before the start of the output; in that case both size
 * outputs keep the value 0.
 *
 * The RC_* / IfBit0 / UpdateBit* macros used below read and modify the
 * locals Buffer, BufferLim, look_ahead, look_ahead_ptr, Range, Code and
 * bound, and may return from this function.
 */
__lzma_attribute_Ofast__
int LzmaDecode(CLzmaDecoderState *vs,
	const unsigned char *inStream, SizeT inSize, SizeT *inSizeProcessed,
	unsigned char *outStream, SizeT outSize, SizeT *outSizeProcessed)
{
	CProb *p = vs->Probs;
	SizeT nowPos = 0;
	Byte previousByte = 0;
	UInt32 posStateMask = (1 << (vs->Properties.pb)) - 1;
	UInt32 literalPosMask = (1 << (vs->Properties.lp)) - 1;
	int lc = vs->Properties.lc;
	int state = 0;
	UInt32 rep0 = 1, rep1 = 1, rep2 = 1, rep3 = 1;
	int len = 0;
	const Byte *Buffer;
	const Byte *BufferLim;
	/* sizeof(SizeT) means "look-ahead buffer empty", see RC_READ_BYTE. */
	int look_ahead_ptr = sizeof(SizeT);
	union {
		Byte raw[sizeof(SizeT)];
		SizeT dw;
	} look_ahead;
	UInt32 Range;
	UInt32 Code;

	*inSizeProcessed = 0;
	*outSizeProcessed = 0;

	/* Start every probability at one half. */
	{
		UInt32 i;
		UInt32 numProbs = Literal + ((UInt32)LZMA_LIT_SIZE << (lc
			+ vs->Properties.lp));
		for (i = 0; i < numProbs; i++)
			p[i] = kBitModelTotal >> 1;
	}

	RC_INIT(inStream, inSize);

	while (nowPos < outSize) {
		CProb *prob;
		UInt32 bound;
		int posState = (int)((nowPos)&posStateMask);

		prob = p + IsMatch + (state << kNumPosBitsMax) + posState;
		IfBit0(prob) {
			/* Literal: decode one byte bit by bit. */
			int symbol = 1;
			UpdateBit0(prob);
			prob = p + Literal + (LZMA_LIT_SIZE *
				((((nowPos) & literalPosMask) << lc)
				+ (previousByte >> (8 - lc))));

			if (state >= kNumLitStates) {
				/*
				 * The previous symbol was a match: use the byte
				 * at distance rep0 as context until the decoded
				 * bits diverge from it.
				 */
				int matchByte;
				matchByte = outStream[nowPos - rep0];
				do {
					int bit;
					CProb *probLit;
					matchByte <<= 1;
					bit = (matchByte & 0x100);
					probLit = prob + 0x100 + bit + symbol;
					RC_GET_BIT2(probLit, symbol,
						if (bit != 0)
							break,
						if (bit == 0)
							break)
				} while (symbol < 0x100);
			}
			/* Remaining bits are decoded without match context. */
			while (symbol < 0x100) {
				CProb *probLit = prob + symbol;
				RC_GET_BIT(probLit, symbol)
			}
			previousByte = (Byte)symbol;

			outStream[nowPos++] = previousByte;
			if (state < 4)
				state = 0;
			else if (state < 10)
				state -= 3;
			else
				state -= 6;
		} else {
			UpdateBit1(prob);
			prob = p + IsRep + state;
			IfBit0(prob) {
				/*
				 * New match: shift the distance history; rep0
				 * is decoded after the length below.
				 */
				UpdateBit0(prob);
				rep3 = rep2;
				rep2 = rep1;
				rep1 = rep0;
				state = state < kNumLitStates ? 0 : 3;
				prob = p + LenCoder;
			} else {
				/* Repeated match: reuse one of rep0..rep3. */
				UpdateBit1(prob);
				prob = p + IsRepG0 + state;
				IfBit0(prob) {
					UpdateBit0(prob);
					prob = p + IsRep0Long
						+ (state << kNumPosBitsMax)
						+ posState;
					IfBit0(prob) {
						/* Copy a single byte from rep0. */
						UpdateBit0(prob);

						if (nowPos == 0)
							return LZMA_RESULT_DATA_ERROR;

						state = state < kNumLitStates
							? 9 : 11;
						previousByte = outStream[nowPos
							- rep0];
						outStream[nowPos++] =
							previousByte;

						continue;
					} else {
						UpdateBit1(prob);
					}
				} else {
					/*
					 * Pick rep1, rep2 or rep3 and move it
					 * to the front of the history.
					 */
					UInt32 distance;
					UpdateBit1(prob);
					prob = p + IsRepG1 + state;
					IfBit0(prob) {
						UpdateBit0(prob);
						distance = rep1;
					} else {
						UpdateBit1(prob);
						prob = p + IsRepG2 + state;
						IfBit0(prob) {
							UpdateBit0(prob);
							distance = rep2;
						} else {
							UpdateBit1(prob);
							distance = rep3;
							rep3 = rep2;
						}
						rep2 = rep1;
					}
					rep1 = rep0;
					rep0 = distance;
				}
				state = state < kNumLitStates ? 8 : 11;
				prob = p + RepLenCoder;
			}
			/* Decode the match length with the selected length coder. */
			{
				int numBits, offset;
				CProb *probLen = prob + LenChoice;
				IfBit0(probLen) {
					UpdateBit0(probLen);
					probLen = prob + LenLow
						+ (posState << kLenNumLowBits);
					offset = 0;
					numBits = kLenNumLowBits;
				} else {
					UpdateBit1(probLen);
					probLen = prob + LenChoice2;
					IfBit0(probLen) {
						UpdateBit0(probLen);
						probLen = prob + LenMid
							+ (posState <<
							kLenNumMidBits);
						offset = kLenNumLowSymbols;
						numBits = kLenNumMidBits;
					} else {
						UpdateBit1(probLen);
						probLen = prob + LenHigh;
						offset = kLenNumLowSymbols
							+ kLenNumMidSymbols;
						numBits = kLenNumHighBits;
					}
				}
				RangeDecoderBitTreeDecode(probLen, numBits,
					len);
				len += offset;
			}

			/*
			 * state is below 4 only on the new-match path above
			 * (it was set to 0 or 3), so only new matches decode a
			 * distance here.
			 */
			if (state < 4) {
				int posSlot;
				state += kNumLitStates;
				prob = p + PosSlot +
					((len < kNumLenToPosStates ? len :
					kNumLenToPosStates - 1) <<
					kNumPosSlotBits);
				RangeDecoderBitTreeDecode(prob, kNumPosSlotBits,
					posSlot);
				if (posSlot >= kStartPosModelIndex) {
					int numDirectBits = ((posSlot >> 1)
						- 1);
					rep0 = (2 | ((UInt32)posSlot & 1));
					if (posSlot < kEndPosModelIndex) {
						rep0 <<= numDirectBits;
						prob = p + SpecPos + rep0
							- posSlot - 1;
					} else {
						/*
						 * High bits are read with fixed
						 * 1/2 probability; the low
						 * kNumAlignBits use the Align
						 * table below.
						 */
						numDirectBits -= kNumAlignBits;
						do {
							RC_NORMALIZE
							Range >>= 1;
							rep0 <<= 1;
							if (Code >= Range) {
								Code -= Range;
								rep0 |= 1;
							}
						} while (--numDirectBits != 0);
						prob = p + Align;
						rep0 <<= kNumAlignBits;
						numDirectBits = kNumAlignBits;
					}
					/* Low bits arrive least significant first. */
					{
						int i = 1;
						int mi = 1;
						do {
							CProb *prob3 = prob
								+ mi;
							RC_GET_BIT2(prob3, mi,
								;, rep0 |= i);
							i <<= 1;
						} while (--numDirectBits != 0);
					}
				} else
					rep0 = posSlot;
				if (++rep0 == (UInt32)(0)) {
					/* it's for stream version */
					len = kLzmaStreamWasFinishedId;
					break;
				}
			}

			len += kMatchMinLen;
			/* A match may not reach back before the output start. */
			if (rep0 > nowPos)
				return LZMA_RESULT_DATA_ERROR;

			/* Byte-wise copy: source and destination may overlap. */
			do {
				previousByte = outStream[nowPos - rep0];
				len--;
				outStream[nowPos++] = previousByte;
			} while (len != 0 && nowPos < outSize);
		}
	}
	RC_NORMALIZE;
	/*
	 * Tell static analysis we know len can have a dead assignment.
	 */
	(void)len;
	/*
	 * RC_READ_BYTE advances Buffer by a whole SizeT word but hands the
	 * bytes out one at a time. Bytes still waiting in look_ahead have not
	 * been consumed by the range coder, so they must not be counted as
	 * processed. look_ahead_ptr equals sizeof(SizeT) when nothing is
	 * pending, which makes the correction zero.
	 */
	*inSizeProcessed = (SizeT)(Buffer - inStream)
		- (SizeT)(sizeof(SizeT) - look_ahead_ptr);
	*outSizeProcessed = nowPos;
	return LZMA_RESULT_OK;
}