@@ -39,15 +39,11 @@ inline uint32_t H(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
// Bitwise mixing function I: returns y XOR (x OR (NOT z)), computed on 32-bit words.
3939inline uint32_t I (uint32_t x, uint32_t y, uint32_t z) { return y ^ (x | ~z); }
4040
4141// Process a single 64-byte block
42- void transform (const uint8_t * block, uint32_t & a0, uint32_t & b0, uint32_t & c0, uint32_t & d0) {
43- uint32_t M[16 ];
44- // Decode 64 bytes into 16 words (little-endian)
45- for (int j = 0 ; j < 16 ; ++j) {
46- M[j] = block[j * 4 ] |
47- (block[j * 4 + 1 ] << 8 ) |
48- (block[j * 4 + 2 ] << 16 ) |
49- (block[j * 4 + 3 ] << 24 );
50- }
42+ #pragma GCC optimize("unroll-loops")
43+ __attribute__ ((always_inline))
44+ inline void transform(const uint8_t * block, uint32_t & a0, uint32_t & b0, uint32_t & c0, uint32_t & d0) {
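45+ // Direct zero-copy access: reads the block as host-order 32-bit words, so this is only correct on a little-endian host (NOTE(review): also assumes block is aligned for uint32_t - confirm for stack buffers)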
46+ const uint32_t * M = reinterpret_cast <const uint32_t *>(block);
5147
5248 uint32_t A = a0;
5349 uint32_t B = b0;
@@ -113,85 +109,91 @@ int main(int argc, char* argv[]) {
113109 uint32_t d0 = 0x10325476 ;
114110
115111 uint64_t totalBytes = 0 ;
116- uint8_t buffer[64 ];
112+
113+ // 4MB buffer (reduced loop overhead)
114+ const size_t BUFFER_SIZE = 4 * 1024 * 1024 ;
115+ vector<uint8_t > buffer (BUFFER_SIZE);
117116
118117 // Report initial progress
119118 if (totalExpectedSize > 0 ) reportProgress (0 , totalExpectedSize);
120119
121- // Read from stdin in 64-byte chunks
122- while ( cin.read ((char *)buffer, 64 )) {
123- totalBytes += 64 ;
124- transform (buffer, a0, b0, c0, d0) ;
120+ while (cin) {
121+ cin.read ((char *)buffer. data (), BUFFER_SIZE);
122+ size_t bytesRead = cin. gcount () ;
123+ if (bytesRead == 0 ) break ;
125124
126- // Report progress periodically
125+ size_t offset = 0 ;
126+ while (offset + 64 <= bytesRead) {
127+ transform (buffer.data () + offset, a0, b0, c0, d0);
128+ offset += 64 ;
129+ totalBytes += 64 ;
130+ }
131+
132+ // Report progress
127133 if (totalExpectedSize > 0 ) {
128134 reportProgress (totalBytes, totalExpectedSize);
129135 }
130- }
131-
132- // Handle remaining bytes
133- size_t bytesRead = cin.gcount ();
134- totalBytes += bytesRead;
135-
136- // Padding
137- uint8_t padding[128 ]; // Max padding needed is 64 + 8 = 72 bytes, but we might cross block boundary
138- memset (padding, 0 , 128 );
139-
140- // Copy remaining bytes to padding buffer
141- memcpy (padding, buffer, bytesRead);
142-
143- // Add '1' bit
144- padding[bytesRead] = 0x80 ;
145-
146- size_t paddingLen;
147- if (bytesRead < 56 ) {
148- paddingLen = 56 - bytesRead;
149- } else {
150- paddingLen = 120 - bytesRead;
151- }
152-
153- // Add length (bits) at the end of the last block
154- uint64_t totalBits = totalBytes * 8 ;
155- size_t lengthOffset = bytesRead + paddingLen + 8 - 8 ; // Position for length
156-
157- // If we crossed a block boundary, we process the first block
158- if (bytesRead >= 56 ) {
159- transform (padding, a0, b0, c0, d0);
160- // Move to next block for length
161- lengthOffset = 64 - 8 ; // End of second block
162- // We need to put length at the end of the SECOND block (index 56-63 relative to second block start)
163- // But wait, my padding logic above is slightly complex. Let's simplify.
164- }
165-
166- // Let's redo padding logic to be cleaner
167- // We have 'bytesRead' bytes in 'buffer'.
168- // We copy them to a temp buffer that can hold up to 2 blocks (128 bytes)
169- uint8_t finalBlock[128 ];
170- memset (finalBlock, 0 , 128 );
171- memcpy (finalBlock, buffer, bytesRead);
172-
173- finalBlock[bytesRead] = 0x80 ;
174-
175- if (bytesRead < 56 ) {
176- // Fits in one block
177- // Append length at bytes 56-63
178- for (int i = 0 ; i < 8 ; ++i) {
179- finalBlock[56 + i] = (totalBits >> (i * 8 )) & 0xFF ;
180- }
181- transform (finalBlock, a0, b0, c0, d0);
182- } else {
183- // Need two blocks
184- // First block is padded with 0s after 0x80
185- transform (finalBlock, a0, b0, c0, d0);
186136
187- // Second block has length at end
188- memset (finalBlock, 0 , 64 ); // Clear first block content
189- // Length at bytes 56-63 of second block (which is now at index 56 of finalBlock array if we reused it,
190- // but we just cleared it so it's effectively index 56)
191- for (int i = 0 ; i < 8 ; ++i) {
192- finalBlock[56 + i] = (totalBits >> (i * 8 )) & 0xFF ;
137+ // Handle remaining bytes (partial block at end of buffer)
138+ // If we have remaining bytes, they must be the end of the file
139+ // because we read in multiples of 64 unless EOF.
140+ // The 4 MB buffer size is a multiple of 64, so this only happens at EOF.
141+ if (offset < bytesRead) {
142+ // This is the last partial block
143+ // We need to handle padding here
144+ size_t remaining = bytesRead - offset;
145+
146+ // Copy remaining to a temp buffer for padding
147+ uint8_t finalBlock[128 ];
148+ memset (finalBlock, 0 , 128 );
149+ memcpy (finalBlock, buffer.data () + offset, remaining);
150+
151+ totalBytes += remaining;
152+
153+ // Add '1' bit
154+ finalBlock[remaining] = 0x80 ;
155+
156+ uint64_t totalBits = totalBytes * 8 ;
157+
158+ if (remaining < 56 ) {
159+ // Fits in one block
160+ for (int i = 0 ; i < 8 ; ++i) {
161+ finalBlock[56 + i] = (totalBits >> (i * 8 )) & 0xFF ;
162+ }
163+ transform (finalBlock, a0, b0, c0, d0);
164+ } else {
165+ // Need two blocks
166+ transform (finalBlock, a0, b0, c0, d0);
167+
168+ memset (finalBlock, 0 , 64 );
169+ for (int i = 0 ; i < 8 ; ++i) {
170+ finalBlock[56 + i] = (totalBits >> (i * 8 )) & 0xFF ;
171+ }
172+ transform (finalBlock, a0, b0, c0, d0);
173+ }
174+ break ; // Done
193175 }
194- transform (finalBlock, a0, b0, c0, d0);
176+ }
177+
178+ // If exact multiple of 64 bytes, we still need padding
179+ if (totalBytes % 64 == 0 ) {
180+ uint8_t finalBlock[64 ];
181+ memset (finalBlock, 0 , 64 );
182+ finalBlock[0 ] = 0x80 ;
183+
184+ uint64_t totalBits = totalBytes * 8 ;
185+
186+ // If 0x80 leaves enough room for length (56 bytes)
187+ // 0 < 56, so yes.
188+ // Wait, if totalBytes % 64 == 0, we are at start of new block.
189+ // So we have 0 bytes of data in this block.
190+ // 0 bytes data + 1 byte (0x80) = 1 byte used.
191+ // 1 < 56, so we fit in one block.
192+
193+ for (int i = 0 ; i < 8 ; ++i) {
194+ finalBlock[56 + i] = (totalBits >> (i * 8 )) & 0xFF ;
195+ }
196+ transform (finalBlock, a0, b0, c0, d0);
195197 }
196198
197199 // Output
0 commit comments