When CONFIG_SSE is enabled, use the "prefetchnta" instruction to load the next chunk of data into the CPU cache. This only works when the input stream is covered by an MTRR. In case the input stream is read from the SPI ROM MMIO area, this keeps the SPI controller busy fetching new data, which is automatically placed into the CPU cache, resulting in less I/O wait on the CPU side and faster decompression. When the input stream is not cacheable the prefetch instruction has no effect. The SPI interface on the tested device runs at 100Mbit/s and the Sandy Bridge mobile CPU has quite some work to do decompressing the LZMA stream. That gives the SPI controller enough time to preload data into the cache. The payload of 1100213 bytes is now read in 164msec, resulting in an input bandwidth of 53MBit/s. TEST=Booted on Lenovo X220 and used cbmem -t: Before: 16:finished LZMA decompress (ignore for x86) 1,218,418 (210,054) After: 16:finished LZMA decompress (ignore for x86) 1,170,949 (164,868) Boots 46msec faster than before or 30% faster than before. Change-Id: I3b2ed7fe0883f271553ecd1ab4191e4848ad0299 Signed-off-by: Patrick Rudolph <patrick.rudolph@9elements.com> Reviewed-on: https://review.coreboot.org/c/coreboot/+/88813 Tested-by: build bot (Jenkins) <no-reply@coreboot.org> Reviewed-by: Angel Pons <th3fanbus@gmail.com>
457 lines
11 KiB
C
457 lines
11 KiB
C
/*
  LzmaDecode.c
  LZMA Decoder (optimized for Speed version)

  LZMA SDK 4.40 Copyright (c) 1999-2006 Igor Pavlov (2006-05-01)
  http://www.7-zip.org/

  LZMA SDK is licensed under two licenses:
  1) GNU Lesser General Public License (GNU LGPL)
  2) Common Public License (CPL)
  It means that you can select one of these two licenses and
  follow rules of that license.

  SPECIAL EXCEPTION:
  Igor Pavlov, as the author of this Code, expressly permits you to
  statically or dynamically link your Code (or bind by name) to the
  interfaces of this file without subjecting your linked Code to the
  terms of the CPL or GNU LGPL. Any modifications or additions
  to this file, however, are subject to the LGPL or CPL terms.
*/
|
|
|
|
/*
 * Function attribute placed in front of the decoder entry point. With
 * CONFIG(DECOMPRESS_OFAST) it asks the compiler to build the annotated
 * function with the "Ofast" optimization level; otherwise it expands to
 * nothing.
 */
#if CONFIG(DECOMPRESS_OFAST)
#define __lzma_attribute_Ofast__ __attribute__((optimize("Ofast")))
#else
#define __lzma_attribute_Ofast__
#endif
|
|
|
|
/*
 * When the input stream is covered by an MTRR the "prefetch" instruction
 * will load the next chunk of data into the CPU cache ahead of time.
 * On a 100MBit/s SPI interface this reduces the time spent in I/O wait
 * by 5usec for every cache-line (64bytes) prefetched.
 *
 * x is an lvalue naming the memory to prefetch; it is passed to the
 * instruction as a memory operand. Without CONFIG(SSE) the macro expands
 * to nothing and x is not evaluated.
 */
#if CONFIG(SSE)
#define __lzma_prefetch(x) {asm volatile("prefetchnta %0" :: "m" (x));}
#else
#define __lzma_prefetch(x)
#endif
|
|
|
|
#include "lzmadecode.h"
|
|
#include <types.h>
|
|
|
|
/*
 * Range coder threshold: RC_NORMALIZE shifts in another input byte as soon
 * as Range drops below kTopValue (2^24).
 */
#define kNumTopBits 24
#define kTopValue ((UInt32)1 << kNumTopBits)

/*
 * Bit model: probabilities are scaled to kBitModelTotal (2^11) and every
 * update moves a probability by 1/2^kNumMoveBits of its distance to the
 * respective end of that range.
 */
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumMoveBits 5
|
|
|
|
/* Use sizeof(SizeT) sized reads whenever possible to avoid bad flash performance. Fall back
 * to byte reads for last sizeof(SizeT) bytes since RC_TEST returns an error when BufferLim
 * is *reached* (not surpassed!), meaning we can't allow that to happen while
 * there are still bytes to decode from the algorithm's point of view.
 *
 * Evaluates to the next input byte. Bytes still pending in look_ahead are
 * handed out first; a new word is only fetched when Buffer is aligned to
 * sizeof(SizeT) and more than sizeof(SizeT) bytes remain. After a word read
 * Buffer is already sizeof(SizeT) bytes ahead although only one of those
 * bytes has been consumed. */
#define RC_READ_BYTE								\
	(look_ahead_ptr < sizeof(SizeT) ? look_ahead.raw[look_ahead_ptr++]	\
	 : ((((uintptr_t) Buffer & (sizeof(SizeT) - 1))				\
	    || ((SizeT) (BufferLim - Buffer) <= sizeof(SizeT))) ? (*Buffer++)	\
	   : ((look_ahead.dw = *(SizeT *)Buffer), (Buffer += sizeof(SizeT)),	\
	      (look_ahead_ptr = 1), look_ahead.raw[0])))

/* Prime the range coder: full Range, Code loaded from the first 5 input bytes. */
#define RC_INIT2 Code = 0; Range = 0xFFFFFFFF;			\
	{							\
		int rc_init_i;					\
								\
		for (rc_init_i = 0; rc_init_i < 5; rc_init_i++) {	\
			RC_TEST;				\
			Code = (Code << 8) | RC_READ_BYTE;	\
		}						\
	}

/* Leave the enclosing function with LZMA_RESULT_DATA_ERROR once the input is used up. */
#define RC_TEST { if (Buffer == BufferLim) return LZMA_RESULT_DATA_ERROR; }

/* Point the range coder at [buffer, buffer + bufferSize) and prime it. */
#define RC_INIT(buffer, bufferSize) Buffer = (buffer); \
	BufferLim = (buffer) + (bufferSize); RC_INIT2
|
|
|
|
|
|
/*
 * Renormalize the range coder: when Range has dropped below kTopValue,
 * scale Range by 2^8 and shift the next input byte into Code (bailing out
 * of the enclosing function via RC_TEST when the input is exhausted).
 *
 * After such a read, if Buffer sits on a 64-byte boundary and at least
 * 128 bytes of input remain, the bytes starting 64 bytes ahead of Buffer
 * are prefetched.
 *
 * Expands to a bare if statement, so it works with or without a trailing
 * semicolon at the point of use.
 */
#define RC_NORMALIZE						\
	if (Range < kTopValue) {				\
		RC_TEST;					\
		Range <<= 8;					\
		Code = (Code << 8) | RC_READ_BYTE;		\
		if (!((uintptr_t)Buffer & 63)) {		\
			if ((BufferLim - Buffer) >= 128) {	\
				__lzma_prefetch(Buffer[64]);	\
			}					\
		}						\
	}
|
|
|
|
/*
 * Decode one bit with probability *(p). Renormalizes, computes the split
 * point 'bound' (a variable of the caller) and opens an if statement whose
 * body runs when the decoded bit is 0; the matching else handles bit 1.
 * The caller must follow up with UpdateBit0(p) or UpdateBit1(p).
 */
#define IfBit0(p)						\
	RC_NORMALIZE;						\
	bound = (Range >> kNumBitModelTotalBits) * *(p);	\
	if (Code < bound)

/*
 * Bit was 0: keep the lower part of the range and move *(p) towards
 * kBitModelTotal. Expands to two statements, use inside braces only.
 */
#define UpdateBit0(p)						\
	Range = bound;						\
	*(p) += (kBitModelTotal - *(p)) >> kNumMoveBits

/*
 * Bit was 1: keep the upper part of the range and move *(p) towards 0.
 * Expands to three statements, use inside braces only.
 */
#define UpdateBit1(p)						\
	Range -= bound;						\
	Code -= bound;						\
	*(p) -= (*(p)) >> kNumMoveBits
|
|
|
|
/*
 * Decode one bit with probability *(p) and append it to the bit-tree index
 * mi (mi = 2 * mi + bit). A0 is executed after a 0 bit, A1 after a 1 bit;
 * both are pasted verbatim, so they may be empty statements or contain
 * control flow such as break.
 */
#define RC_GET_BIT2(p, mi, A0, A1)		\
	IfBit0(p) {				\
		UpdateBit0(p);			\
		mi <<= 1;			\
		A0;				\
	} else {				\
		UpdateBit1(p);			\
		mi = (mi + mi) + 1;		\
		A1;				\
	}

/* RC_GET_BIT2 without any extra per-bit action. */
#define RC_GET_BIT(p, mi) RC_GET_BIT2(p, mi, ;, ;)
|
|
|
|
/*
 * Decode a numLevels-bit symbol through the bit tree rooted at probs and
 * store it in res. res doubles as the tree index while decoding: it starts
 * at 1 and gains one bit per level, so the leading 1 (1 << numLevels) is
 * subtracted at the end. numLevels must be at least 1.
 */
#define RangeDecoderBitTreeDecode(probs, numLevels, res)	\
	{							\
		int levels_left = (numLevels);			\
								\
		res = 1;					\
		do {						\
			CProb *node = (probs) + res;		\
			RC_GET_BIT(node, res)			\
		} while (--levels_left != 0);			\
		res -= (1 << (numLevels));			\
	}
|
|
|
|
|
|
/* The position state uses at most kNumPosBitsMax bits of the output position. */
#define kNumPosBitsMax 4
#define kNumPosStatesMax (1 << kNumPosBitsMax)

/* Bit widths and symbol counts of the three length sub-coders. */
#define kLenNumLowBits 3
#define kLenNumLowSymbols (1 << kLenNumLowBits)
#define kLenNumMidBits 3
#define kLenNumMidSymbols (1 << kLenNumMidBits)
#define kLenNumHighBits 8
#define kLenNumHighSymbols (1 << kLenNumHighBits)

/* Offsets inside one length coder (relative to LenCoder or RepLenCoder). */
#define LenChoice 0
#define LenChoice2 (LenChoice + 1)
#define LenLow (LenChoice2 + 1)
#define LenMid (LenLow + (kNumPosStatesMax << kLenNumLowBits))
#define LenHigh (LenMid + (kNumPosStatesMax << kLenNumMidBits))
#define kNumLenProbs (LenHigh + kLenNumHighSymbols)

/* Decoder state machine: states below kNumLitStates follow a literal. */
#define kNumStates 12
#define kNumLitStates 7

/* Position slot ranges that select how the match distance is decoded. */
#define kStartPosModelIndex 4
#define kEndPosModelIndex 14
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1))

#define kNumPosSlotBits 6
#define kNumLenToPosStates 4

#define kNumAlignBits 4
#define kAlignTableSize (1 << kNumAlignBits)

/* Added to every decoded match length. */
#define kMatchMinLen 2

/*
 * Offsets of the individual probability groups inside the probability
 * array. Each group starts where the previous one ends; Literal is the
 * start of the literal coder and therefore also the size of everything
 * before it.
 */
#define IsMatch 0
#define IsRep (IsMatch + (kNumStates << kNumPosBitsMax))
#define IsRepG0 (IsRep + kNumStates)
#define IsRepG1 (IsRepG0 + kNumStates)
#define IsRepG2 (IsRepG1 + kNumStates)
#define IsRep0Long (IsRepG2 + kNumStates)
#define PosSlot (IsRep0Long + (kNumStates << kNumPosBitsMax))
#define SpecPos (PosSlot + (kNumLenToPosStates << kNumPosSlotBits))
#define Align (SpecPos + kNumFullDistances - kEndPosModelIndex)
#define LenCoder (Align + kAlignTableSize)
#define RepLenCoder (LenCoder + kNumLenProbs)
#define Literal (RepLenCoder + kNumLenProbs)
|
|
|
|
/*
 * Build-time consistency check: the probability layout computed in this
 * file must end exactly at LZMA_BASE_SIZE (defined outside this file).
 */
#if Literal != LZMA_BASE_SIZE
#error "LZMA probability table layout mismatch: Literal != LZMA_BASE_SIZE"
#endif
|
|
|
|
/*
 * Decode the LZMA properties header into lc/lp/pb.
 *
 * propsRes:  receives the decoded lc, lp and pb values; left untouched on
 *            error.
 * propsData: raw properties; only the first byte is inspected here.
 * size:      number of bytes available at propsData, must be at least
 *            LZMA_PROPERTIES_SIZE.
 *
 * Returns LZMA_RESULT_OK on success, LZMA_RESULT_DATA_ERROR if size is too
 * small or the first byte is not below 9 * 5 * 5.
 */
int LzmaDecodeProperties(CLzmaProperties *propsRes,
	const unsigned char *propsData, int size)
{
	unsigned char prop0;

	if (size < LZMA_PROPERTIES_SIZE)
		return LZMA_RESULT_DATA_ERROR;

	prop0 = propsData[0];
	if (prop0 >= (9 * 5 * 5))
		return LZMA_RESULT_DATA_ERROR;

	/*
	 * prop0 holds (pb * 5 + lp) * 9 + lc. Split it by repeated
	 * subtraction, which yields the same result as
	 * pb = prop0 / 45, lp = (prop0 / 9) % 5, lc = prop0 % 9.
	 */
	for (propsRes->pb = 0; prop0 >= (9 * 5); propsRes->pb++, prop0 -= (9 * 5))
		;
	for (propsRes->lp = 0; prop0 >= 9; propsRes->lp++, prop0 -= 9)
		;
	propsRes->lc = prop0;

	return LZMA_RESULT_OK;
}
|
|
|
|
#define kLzmaStreamWasFinishedId (-1)
|
|
|
|
__lzma_attribute_Ofast__
|
|
int LzmaDecode(CLzmaDecoderState *vs,
|
|
const unsigned char *inStream, SizeT inSize, SizeT *inSizeProcessed,
|
|
unsigned char *outStream, SizeT outSize, SizeT *outSizeProcessed)
|
|
{
|
|
CProb *p = vs->Probs;
|
|
SizeT nowPos = 0;
|
|
Byte previousByte = 0;
|
|
UInt32 posStateMask = (1 << (vs->Properties.pb)) - 1;
|
|
UInt32 literalPosMask = (1 << (vs->Properties.lp)) - 1;
|
|
int lc = vs->Properties.lc;
|
|
|
|
|
|
int state = 0;
|
|
UInt32 rep0 = 1, rep1 = 1, rep2 = 1, rep3 = 1;
|
|
int len = 0;
|
|
const Byte *Buffer;
|
|
const Byte *BufferLim;
|
|
int look_ahead_ptr = sizeof(SizeT);
|
|
union {
|
|
Byte raw[sizeof(SizeT)];
|
|
SizeT dw;
|
|
} look_ahead;
|
|
UInt32 Range;
|
|
UInt32 Code;
|
|
|
|
*inSizeProcessed = 0;
|
|
*outSizeProcessed = 0;
|
|
|
|
{
|
|
UInt32 i;
|
|
UInt32 numProbs = Literal + ((UInt32)LZMA_LIT_SIZE << (lc
|
|
+ vs->Properties.lp));
|
|
for (i = 0; i < numProbs; i++)
|
|
p[i] = kBitModelTotal >> 1;
|
|
}
|
|
|
|
RC_INIT(inStream, inSize);
|
|
|
|
|
|
while (nowPos < outSize) {
|
|
CProb *prob;
|
|
UInt32 bound;
|
|
int posState = (int)((nowPos)&posStateMask);
|
|
|
|
prob = p + IsMatch + (state << kNumPosBitsMax) + posState;
|
|
IfBit0(prob) {
|
|
int symbol = 1;
|
|
UpdateBit0(prob);
|
|
prob = p + Literal + (LZMA_LIT_SIZE *
|
|
((((nowPos) & literalPosMask) << lc)
|
|
+ (previousByte >> (8 - lc))));
|
|
|
|
if (state >= kNumLitStates) {
|
|
int matchByte;
|
|
matchByte = outStream[nowPos - rep0];
|
|
do {
|
|
int bit;
|
|
CProb *probLit;
|
|
matchByte <<= 1;
|
|
bit = (matchByte & 0x100);
|
|
probLit = prob + 0x100 + bit + symbol;
|
|
RC_GET_BIT2(probLit, symbol,
|
|
if (bit != 0)
|
|
break,
|
|
if (bit == 0)
|
|
break)
|
|
} while (symbol < 0x100);
|
|
}
|
|
while (symbol < 0x100) {
|
|
CProb *probLit = prob + symbol;
|
|
RC_GET_BIT(probLit, symbol)
|
|
}
|
|
previousByte = (Byte)symbol;
|
|
|
|
outStream[nowPos++] = previousByte;
|
|
if (state < 4)
|
|
state = 0;
|
|
else if (state < 10)
|
|
state -= 3;
|
|
else
|
|
state -= 6;
|
|
} else {
|
|
UpdateBit1(prob);
|
|
prob = p + IsRep + state;
|
|
IfBit0(prob) {
|
|
UpdateBit0(prob);
|
|
rep3 = rep2;
|
|
rep2 = rep1;
|
|
rep1 = rep0;
|
|
state = state < kNumLitStates ? 0 : 3;
|
|
prob = p + LenCoder;
|
|
} else {
|
|
UpdateBit1(prob);
|
|
prob = p + IsRepG0 + state;
|
|
IfBit0(prob) {
|
|
UpdateBit0(prob);
|
|
prob = p + IsRep0Long
|
|
+ (state << kNumPosBitsMax)
|
|
+ posState;
|
|
IfBit0(prob) {
|
|
UpdateBit0(prob);
|
|
|
|
if (nowPos == 0)
|
|
return LZMA_RESULT_DATA_ERROR;
|
|
|
|
state = state < kNumLitStates
|
|
? 9 : 11;
|
|
previousByte = outStream[nowPos
|
|
- rep0];
|
|
outStream[nowPos++] =
|
|
previousByte;
|
|
|
|
continue;
|
|
} else {
|
|
UpdateBit1(prob);
|
|
}
|
|
} else {
|
|
UInt32 distance;
|
|
UpdateBit1(prob);
|
|
prob = p + IsRepG1 + state;
|
|
IfBit0(prob) {
|
|
UpdateBit0(prob);
|
|
distance = rep1;
|
|
} else {
|
|
UpdateBit1(prob);
|
|
prob = p + IsRepG2 + state;
|
|
IfBit0(prob) {
|
|
UpdateBit0(prob);
|
|
distance = rep2;
|
|
} else {
|
|
UpdateBit1(prob);
|
|
distance = rep3;
|
|
rep3 = rep2;
|
|
}
|
|
rep2 = rep1;
|
|
}
|
|
rep1 = rep0;
|
|
rep0 = distance;
|
|
}
|
|
state = state < kNumLitStates ? 8 : 11;
|
|
prob = p + RepLenCoder;
|
|
}
|
|
{
|
|
int numBits, offset;
|
|
CProb *probLen = prob + LenChoice;
|
|
IfBit0(probLen) {
|
|
UpdateBit0(probLen);
|
|
probLen = prob + LenLow
|
|
+ (posState << kLenNumLowBits);
|
|
offset = 0;
|
|
numBits = kLenNumLowBits;
|
|
} else {
|
|
UpdateBit1(probLen);
|
|
probLen = prob + LenChoice2;
|
|
IfBit0(probLen) {
|
|
UpdateBit0(probLen);
|
|
probLen = prob + LenMid
|
|
+ (posState <<
|
|
kLenNumMidBits);
|
|
offset = kLenNumLowSymbols;
|
|
numBits = kLenNumMidBits;
|
|
} else {
|
|
UpdateBit1(probLen);
|
|
probLen = prob + LenHigh;
|
|
offset = kLenNumLowSymbols
|
|
+ kLenNumMidSymbols;
|
|
numBits = kLenNumHighBits;
|
|
}
|
|
}
|
|
RangeDecoderBitTreeDecode(probLen, numBits,
|
|
len);
|
|
len += offset;
|
|
}
|
|
|
|
if (state < 4) {
|
|
int posSlot;
|
|
state += kNumLitStates;
|
|
prob = p + PosSlot +
|
|
((len < kNumLenToPosStates ? len :
|
|
kNumLenToPosStates - 1) <<
|
|
kNumPosSlotBits);
|
|
RangeDecoderBitTreeDecode(prob, kNumPosSlotBits,
|
|
posSlot);
|
|
if (posSlot >= kStartPosModelIndex) {
|
|
int numDirectBits = ((posSlot >> 1)
|
|
- 1);
|
|
rep0 = (2 | ((UInt32)posSlot & 1));
|
|
if (posSlot < kEndPosModelIndex) {
|
|
rep0 <<= numDirectBits;
|
|
prob = p + SpecPos + rep0
|
|
- posSlot - 1;
|
|
} else {
|
|
numDirectBits -= kNumAlignBits;
|
|
do {
|
|
RC_NORMALIZE
|
|
Range >>= 1;
|
|
rep0 <<= 1;
|
|
if (Code >= Range) {
|
|
Code -= Range;
|
|
rep0 |= 1;
|
|
}
|
|
} while (--numDirectBits != 0);
|
|
prob = p + Align;
|
|
rep0 <<= kNumAlignBits;
|
|
numDirectBits = kNumAlignBits;
|
|
}
|
|
{
|
|
int i = 1;
|
|
int mi = 1;
|
|
do {
|
|
CProb *prob3 = prob
|
|
+ mi;
|
|
RC_GET_BIT2(prob3, mi,
|
|
;, rep0 |= i);
|
|
i <<= 1;
|
|
} while (--numDirectBits != 0);
|
|
}
|
|
} else
|
|
rep0 = posSlot;
|
|
if (++rep0 == (UInt32)(0)) {
|
|
/* it's for stream version */
|
|
len = kLzmaStreamWasFinishedId;
|
|
break;
|
|
}
|
|
}
|
|
|
|
len += kMatchMinLen;
|
|
if (rep0 > nowPos)
|
|
return LZMA_RESULT_DATA_ERROR;
|
|
|
|
|
|
do {
|
|
previousByte = outStream[nowPos - rep0];
|
|
len--;
|
|
outStream[nowPos++] = previousByte;
|
|
} while (len != 0 && nowPos < outSize);
|
|
}
|
|
}
|
|
RC_NORMALIZE;
|
|
/*
|
|
* Tell static analysis we know len can have a dead assignment.
|
|
*/
|
|
(void)len;
|
|
|
|
|
|
*inSizeProcessed = (SizeT)(Buffer - inStream);
|
|
*outSizeProcessed = nowPos;
|
|
return LZMA_RESULT_OK;
|
|
}
|