main.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124
  1. /*
  2. * Brute force collision tester for 64-bit hashes
  3. * Part of the xxHash project
  4. * Copyright (C) 2019-2021 Yann Collet
  5. *
  6. * GPL v2 License
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21. *
  22. * You can contact the author at:
  23. * - xxHash homepage: https://www.xxhash.com
  24. * - xxHash source repository: https://github.com/Cyan4973/xxHash
  25. */
  26. /*
  27. * The collision tester will generate 24 billion hashes (by default),
  28. * and count how many collisions were produced by the 64-bit hash algorithm.
  29. * The optimal amount of collisions for 64-bit is ~18 collisions.
  30. * A good hash should be close to this figure.
  31. *
  32. * This program requires a lot of memory:
  33. * - Either store hash values directly => 192 GB
  34. * - Or use a filter:
  35. * - 32 GB (by default) for the filter itself
  36. * - + ~14 GB for the list of hashes (depending on the filter's outcome)
  37. * Due to these memory constraints, it requires a 64-bit system.
  38. */
  39. /* === Dependencies === */
  40. #include <stdint.h> /* uint64_t */
  41. #include <stdlib.h> /* malloc, free, qsort, exit */
  42. #include <string.h> /* memset */
  43. #include <stdio.h> /* printf, fflush */
  44. #undef NDEBUG /* ensure assert is _not_ disabled */
  45. #include <assert.h>
  46. #include "hashes.h" /* UniHash, hashfn, hashfnTable */
  47. #include "sort.hh" /* sort64 */
  48. typedef enum { ht32, ht64, ht128 } Htype_e;
  49. /* === Debug === */
  50. #define EXIT(...) { printf(__VA_ARGS__); printf("\n"); exit(1); }
  51. static void hexRaw(const void* buffer, size_t size)
  52. {
  53. const unsigned char* p = (const unsigned char*)buffer;
  54. for (size_t i=0; i<size; i++) {
  55. printf("%02X", p[i]);
  56. }
  57. }
  58. void printHash(const void* table, size_t n, Htype_e htype)
  59. {
  60. if ((htype == ht64) || (htype == ht32)){
  61. uint64_t const h64 = ((const uint64_t*)table)[n];
  62. hexRaw(&h64, sizeof(h64));
  63. } else {
  64. assert(htype == ht128);
  65. XXH128_hash_t const h128 = ((const XXH128_hash_t*)table)[n];
  66. hexRaw(&h128, sizeof(h128));
  67. }
  68. }
  69. /* === Generate Random unique Samples to hash === */
  70. /*
  71. * These functions will generate and update a sample to hash.
  72. * initSample() will fill a buffer with random bytes,
  73. * updateSample() will modify one slab in the input buffer.
  74. * updateSample() guarantees it will produce unique samples,
  75. * but it needs to know the total number of samples.
  76. */
  77. static const uint64_t prime64_1 = 11400714785074694791ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
  78. static const uint64_t prime64_2 = 14029467366897019727ULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
  79. static const uint64_t prime64_3 = 1609587929392839161ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
  80. static uint64_t avalanche64(uint64_t h64)
  81. {
  82. h64 ^= h64 >> 33;
  83. h64 *= prime64_2;
  84. h64 ^= h64 >> 29;
  85. h64 *= prime64_3;
  86. h64 ^= h64 >> 32;
  87. return h64;
  88. }
  89. static unsigned char randomByte(uint64_t n)
  90. {
  91. uint64_t n64 = avalanche64(n+1);
  92. n64 *= prime64_1;
  93. return (unsigned char)(n64 >> 56);
  94. }
  95. typedef enum { sf_slab5, sf_sparse } sf_genMode;
  96. #ifdef SLAB5
  97. /*
  98. * Slab5 sample generation.
  99. * This algorithm generates unique inputs flipping on average 16 bits per candidate.
  100. * It is generally much more friendly for most hash algorithms, especially
  101. * weaker ones, as it shuffles more the input.
  102. * The algorithm also avoids overfitting the per4 or per8 ingestion patterns.
  103. */
  104. #define SLAB_SIZE 5
  105. typedef struct {
  106. void* buffer;
  107. size_t size;
  108. sf_genMode mode;
  109. size_t prngSeed;
  110. uint64_t hnb;
  111. } sampleFactory;
  112. static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
  113. {
  114. uint64_t const minNbSlabs = ((htotal-1) >> 32) + 1;
  115. uint64_t const minSize = minNbSlabs * SLAB_SIZE;
  116. if (sf->size < minSize)
  117. EXIT("sample size must be >= %i bytes for this amount of hashes",
  118. (int)minSize);
  119. unsigned char* const p = (unsigned char*)sf->buffer;
  120. for (size_t n=0; n < sf->size; n++)
  121. p[n] = randomByte(n);
  122. sf->hnb = 0;
  123. }
  124. static sampleFactory*
  125. create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
  126. {
  127. sampleFactory* const sf = malloc(sizeof(sampleFactory));
  128. if (!sf) EXIT("not enough memory");
  129. void* const buffer = malloc(size);
  130. if (!buffer) EXIT("not enough memory");
  131. sf->buffer = buffer;
  132. sf->size = size;
  133. sf->mode = sf_slab5;
  134. sf->prngSeed = seed;
  135. init_sampleFactory(sf, htotal);
  136. return sf;
  137. }
  138. static void free_sampleFactory(sampleFactory* sf)
  139. {
  140. if (!sf) return;
  141. free(sf->buffer);
  142. free(sf);
  143. }
  144. static inline void update_sampleFactory(sampleFactory* sf)
  145. {
  146. size_t const nbSlabs = sf->size / SLAB_SIZE;
  147. size_t const SlabNb = sf->hnb % nbSlabs;
  148. sf->hnb++;
  149. char* const ptr = (char*)sf->buffer;
  150. size_t const start = (SlabNb * SLAB_SIZE) + 1;
  151. uint32_t val32;
  152. memcpy(&val32, ptr+start, sizeof(val32));
  153. static const uint32_t prime32_5 = 374761393U;
  154. val32 += prime32_5;
  155. memcpy(ptr+start, &val32, sizeof(val32));
  156. }
  157. #else
  158. /*
  159. * Sparse sample generation.
  160. * This is the default pattern generator.
  161. * It only flips one bit at a time (mostly).
  162. * Low hamming distance scenario is more difficult for weak hash algorithms.
  163. * Note that CRC is immune to this scenario, since they are specifically
  164. * designed to detect low hamming distances.
  165. * Prefer the Slab5 pattern generator for collisions on CRC algorithms.
  166. */
  167. #define SPARSE_LEVEL_MAX 15
  168. /* Nb of combinations of m bits in a register of n bits */
  169. static double Cnm(int n, int m)
  170. {
  171. assert(n > 0);
  172. assert(m > 0);
  173. assert(m <= n);
  174. double acc = 1;
  175. for (int i=0; i<m; i++) {
  176. acc *= n - i;
  177. acc /= 1 + i;
  178. }
  179. return acc;
  180. }
  181. static int enoughCombos(size_t size, uint64_t htotal)
  182. {
  183. if (size < 2) return 0; /* ensure no multiplication by negative */
  184. uint64_t acc = 0;
  185. uint64_t const srcBits = size * 8; assert(srcBits < INT_MAX);
  186. int nbBitsSet = 0;
  187. while (acc < htotal) {
  188. nbBitsSet++;
  189. if (nbBitsSet >= SPARSE_LEVEL_MAX) return 0;
  190. acc += (uint64_t)Cnm((int)srcBits, nbBitsSet);
  191. }
  192. return 1;
  193. }
  194. typedef struct {
  195. void* buffer;
  196. size_t size;
  197. sf_genMode mode;
  198. /* sparse */
  199. size_t bitIdx[SPARSE_LEVEL_MAX];
  200. int level;
  201. size_t maxBitIdx;
  202. /* slab5 */
  203. size_t nbSlabs;
  204. size_t current;
  205. uint64_t prngSeed;
  206. } sampleFactory;
  207. static void init_sampleFactory(sampleFactory* sf, uint64_t htotal)
  208. {
  209. if (!enoughCombos(sf->size, htotal)) {
  210. EXIT("sample size must be larger for this amount of hashes");
  211. }
  212. memset(sf->bitIdx, 0, sizeof(sf->bitIdx));
  213. sf->level = 0;
  214. unsigned char* const p = (unsigned char*)sf->buffer;
  215. for (size_t n=0; n<sf->size; n++)
  216. p[n] = randomByte(sf->prngSeed + n);
  217. }
  218. static sampleFactory*
  219. create_sampleFactory(size_t size, uint64_t htotal, uint64_t seed)
  220. {
  221. sampleFactory* const sf = malloc(sizeof(sampleFactory));
  222. if (!sf) EXIT("not enough memory");
  223. void* const buffer = malloc(size);
  224. if (!buffer) EXIT("not enough memory");
  225. sf->buffer = buffer;
  226. sf->size = size;
  227. sf->mode = sf_sparse;
  228. sf->maxBitIdx = size * 8;
  229. sf->prngSeed = seed;
  230. init_sampleFactory(sf, htotal);
  231. return sf;
  232. }
  233. static void free_sampleFactory(sampleFactory* sf)
  234. {
  235. if (!sf) return;
  236. free(sf->buffer);
  237. free(sf);
  238. }
  239. static void flipbit(void* buffer, uint64_t bitID)
  240. {
  241. size_t const pos = (size_t)(bitID >> 3);
  242. unsigned char const mask = (unsigned char)(1 << (bitID & 7));
  243. unsigned char* const p = (unsigned char*)buffer;
  244. p[pos] ^= mask;
  245. }
  246. static int updateBit(void* buffer, size_t* bitIdx, int level, size_t max)
  247. {
  248. if (level==0) return 0; /* can't progress further */
  249. flipbit(buffer, bitIdx[level]); /* erase previous bits */
  250. if (bitIdx[level] < max-1) { /* simple case: go to next bit */
  251. bitIdx[level]++;
  252. flipbit(buffer, bitIdx[level]); /* set new bit */
  253. return 1;
  254. }
  255. /* reached last bit: need to update a bit from lower level */
  256. if (!updateBit(buffer, bitIdx, level-1, max-1)) return 0;
  257. bitIdx[level] = bitIdx[level-1] + 1;
  258. flipbit(buffer, bitIdx[level]); /* set new bit */
  259. return 1;
  260. }
  261. static inline void update_sampleFactory(sampleFactory* sf)
  262. {
  263. if (!updateBit(sf->buffer, sf->bitIdx, sf->level, sf->maxBitIdx)) {
  264. /* no more room => move to next level */
  265. sf->level++;
  266. assert(sf->level < SPARSE_LEVEL_MAX);
  267. /* set new bits */
  268. for (int i=1; i <= sf->level; i++) {
  269. sf->bitIdx[i] = (size_t)(i-1);
  270. flipbit(sf->buffer, sf->bitIdx[i]);
  271. }
  272. }
  273. }
  274. #endif /* pattern generator selection */
  275. /* === Candidate Filter === */
  276. typedef unsigned char Filter;
  277. Filter* create_Filter(int bflog)
  278. {
  279. assert(bflog < 64 && bflog > 1);
  280. size_t bfsize = (size_t)1 << bflog;
  281. Filter* bf = malloc(bfsize);
  282. assert(((void)"Filter creation failed", bf));
  283. memset(bf, 0, bfsize);
  284. return bf;
  285. }
  286. void free_Filter(Filter* bf)
  287. {
  288. free(bf);
  289. }
  290. #ifdef FILTER_1_PROBE
  291. /*
  292. * Attach hash to a slot
  293. * return: Nb of potential collision candidates detected
  294. * 0: position not yet occupied
  295. * 2: position previously occupied by a single candidate
  296. * 1: position already occupied by multiple candidates
  297. */
  298. inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
  299. {
  300. int const slotNb = hash & 3;
  301. int const shift = slotNb * 2 ;
  302. size_t const bfmask = ((size_t)1 << bflog) - 1;
  303. size_t const pos = (hash >> 2) & bfmask;
  304. int const existingCandidates = ((((unsigned char*)bf)[pos]) >> shift) & 3;
  305. static const int addCandidates[4] = { 0, 2, 1, 1 };
  306. static const int nextValue[4] = { 1, 2, 3, 3 };
  307. ((unsigned char*)bf)[pos] |= (unsigned char)(nextValue[existingCandidates] << shift);
  308. return addCandidates[existingCandidates];
  309. }
  310. /*
  311. * Check if provided 64-bit hash is a collision candidate
  312. * Requires the slot to be occupied by at least 2 candidates.
  313. * return >0 if hash is a collision candidate
  314. * 0 otherwise (slot unoccupied, or only one candidate)
  315. * note: unoccupied slots should not happen in this algorithm,
  316. * since all hashes are supposed to have been inserted at least once.
  317. */
  318. inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
  319. {
  320. int const slotNb = hash & 3;
  321. int const shift = slotNb * 2;
  322. size_t const bfmask = ((size_t)1 << bflog) - 1;
  323. size_t const pos = (hash >> 2) & bfmask;
  324. return (((const unsigned char*)bf)[pos]) >> (shift+1) & 1;
  325. }
  326. #else
  327. /*
  328. * 2-probes strategy,
  329. * more efficient at filtering candidates,
  330. * requires filter size to be > nb of hashes
  331. */
  332. #define MIN(a,b) ((a) < (b) ? (a) : (b))
  333. #define MAX(a,b) ((a) > (b) ? (a) : (b))
  334. /*
  335. * Attach hash to 2 slots
  336. * return: Nb of potential candidates detected
  337. * 0: position not yet occupied
  338. * 2: position previously occupied by a single candidate (at most)
  339. * 1: position already occupied by multiple candidates
  340. */
  341. static inline int Filter_insert(Filter* bf, int bflog, uint64_t hash)
  342. {
  343. hash = avalanche64(hash);
  344. unsigned const slot1 = hash & 255;
  345. hash >>= 8;
  346. unsigned const slot2 = hash & 255;
  347. hash >>= 8;
  348. size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
  349. size_t const cacheLineNb = (size_t)hash & fclmask;
  350. size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
  351. unsigned const shift1 = (slot1 & 3) * 2;
  352. unsigned const ex1 = (bf[pos1] >> shift1) & 3;
  353. size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
  354. unsigned const shift2 = (slot2 & 3) * 2;
  355. unsigned const ex2 = (bf[pos2] >> shift2) & 3;
  356. unsigned const existing = MIN(ex1, ex2);
  357. static const int addCandidates[4] = { 0, 2, 1, 1 };
  358. static const unsigned nextValue[4] = { 1, 2, 3, 3 };
  359. bf[pos1] &= (Filter)(~(3 << shift1)); /* erase previous value */
  360. unsigned const max1 = MAX(ex1, nextValue[existing]);
  361. bf[pos1] |= (Filter)(max1 << shift1);
  362. unsigned const max2 = MAX(ex2, nextValue[existing]);
  363. bf[pos2] |= (Filter)(max2 << shift2);
  364. return addCandidates[existing];
  365. }
  366. /*
  367. * Check if provided 64-bit hash is a collision candidate
  368. * Requires the slot to be occupied by at least 2 candidates.
  369. * return >0 if hash is a collision candidate
  370. * 0 otherwise (slot unoccupied, or only one candidate)
  371. * note: unoccupied slots should not happen in this algorithm,
  372. * since all hashes are supposed to have been inserted at least once.
  373. */
  374. static inline int Filter_check(const Filter* bf, int bflog, uint64_t hash)
  375. {
  376. hash = avalanche64(hash);
  377. unsigned const slot1 = hash & 255;
  378. hash >>= 8;
  379. unsigned const slot2 = hash & 255;
  380. hash >>= 8;
  381. size_t const fclmask = ((size_t)1 << (bflog-6)) - 1;
  382. size_t const cacheLineNb = (size_t)hash & fclmask;
  383. size_t const pos1 = (cacheLineNb << 6) + (slot1 >> 2);
  384. unsigned const shift1 = (slot1 & 3) * 2;
  385. unsigned const ex1 = (bf[pos1] >> shift1) & 3;
  386. size_t const pos2 = (cacheLineNb << 6) + (slot2 >> 2);
  387. unsigned const shift2 = (slot2 & 3) * 2;
  388. unsigned const ex2 = (bf[pos2] >> shift2) & 3;
  389. return (ex1 >= 2) && (ex2 >= 2);
  390. }
  391. #endif // FILTER_1_PROBE
  392. /* === Display === */
  393. #include <time.h> /* clock_t, clock, time_t, time, difftime */
  394. void update_indicator(uint64_t v, uint64_t total)
  395. {
  396. static clock_t start = 0;
  397. if (start==0) start = clock();
  398. clock_t const updateRate = CLOCKS_PER_SEC / 2;
  399. clock_t const clockSpan = (clock_t)(clock() - start);
  400. if (clockSpan > updateRate) {
  401. start = clock();
  402. assert(v <= total);
  403. assert(total > 0);
  404. double share = ((double)v / (double)total) * 100;
  405. printf("%6.2f%% (%llu) \r", share, (unsigned long long)v);
  406. fflush(NULL);
  407. }
  408. }
  409. /* note: not thread safe */
  410. const char* displayDelay(double delay_s)
  411. {
  412. static char delayString[50];
  413. memset(delayString, 0, sizeof(delayString));
  414. int const mn = ((int)delay_s / 60) % 60;
  415. int const h = (int)delay_s / 3600;
  416. int const sec = (int)delay_s % 60;
  417. char* p = delayString;
  418. if (h) sprintf(p, "%i h ", h);
  419. if (mn || h) {
  420. p = delayString + strlen(delayString);
  421. sprintf(p, "%i mn ", mn);
  422. }
  423. p = delayString + strlen(delayString);
  424. sprintf(p, "%is ", sec);
  425. return delayString;
  426. }
  427. /* === Math === */
  428. static double power(uint64_t base, int p)
  429. {
  430. double value = 1;
  431. assert(p>=0);
  432. for (int i=0; i<p; i++) {
  433. value *= (double)base;
  434. }
  435. return value;
  436. }
  437. static double estimateNbCollisions(uint64_t nbH, int nbBits)
  438. {
  439. return ((double)nbH * (double)(nbH-1)) / power(2, nbBits+1);
  440. }
  441. static int highestBitSet(uint64_t v)
  442. {
  443. assert(v!=0);
  444. int bitId = 0;
  445. while (v >>= 1) bitId++;
  446. return bitId;
  447. }
  448. /* === Filter and search collisions === */
  449. #undef NDEBUG /* ensure assert is not disabled */
  450. #include <assert.h>
  451. /* will recommend 24 billion samples for 64-bit hashes,
  452. * expecting 18 collisions for a good 64-bit hash */
  453. #define NB_BITS_MAX 64 /* can't store nor analyze hash wider than 64-bits for the time being */
  454. uint64_t select_nbh(int nbBits)
  455. {
  456. assert(nbBits > 0);
  457. if (nbBits > NB_BITS_MAX) nbBits = NB_BITS_MAX;
  458. double targetColls = (double)((128 + 17) - (nbBits * 2));
  459. uint64_t nbH = 24;
  460. while (estimateNbCollisions(nbH, nbBits) < targetColls) nbH *= 2;
  461. return nbH;
  462. }
  463. typedef struct {
  464. uint64_t nbCollisions;
  465. } searchCollisions_results;
  466. typedef struct {
  467. uint64_t nbH;
  468. uint64_t mask;
  469. uint64_t maskSelector;
  470. size_t sampleSize;
  471. uint64_t prngSeed;
  472. int filterLog; /* <0 = disable filter; 0 = auto-size; */
  473. int hashID;
  474. int display;
  475. int nbThreads;
  476. searchCollisions_results* resultPtr;
  477. } searchCollisions_parameters;
  478. #define DISPLAY(...) { if (display) printf(__VA_ARGS__); }
  479. static int isEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype)
  480. {
  481. if ((htype == ht64) || (htype == ht32)) {
  482. uint64_t const h1 = ((const uint64_t*)hTablePtr)[index1];
  483. uint64_t const h2 = ((const uint64_t*)hTablePtr)[index2];
  484. return (h1 == h2);
  485. } else {
  486. assert(htype == ht128);
  487. XXH128_hash_t const h1 = ((const XXH128_hash_t*)hTablePtr)[index1];
  488. XXH128_hash_t const h2 = ((const XXH128_hash_t*)hTablePtr)[index2];
  489. return XXH128_isEqual(h1, h2);
  490. }
  491. }
  492. static int isHighEqual(void* hTablePtr, size_t index1, size_t index2, Htype_e htype, int rShift)
  493. {
  494. uint64_t h1, h2;
  495. if ((htype == ht64) || (htype == ht32)) {
  496. h1 = ((const uint64_t*)hTablePtr)[index1];
  497. h2 = ((const uint64_t*)hTablePtr)[index2];
  498. } else {
  499. assert(htype == ht128);
  500. h1 = ((const XXH128_hash_t*)hTablePtr)[index1].high64;
  501. h2 = ((const XXH128_hash_t*)hTablePtr)[index2].high64;
  502. assert(rShift >= 64);
  503. rShift -= 64;
  504. }
  505. assert(0 <= rShift && rShift < 64);
  506. return (h1 >> rShift) == (h2 >> rShift);
  507. }
  508. /* assumption: (htype*)hTablePtr[index] is valid */
  509. static void addHashCandidate(void* hTablePtr, UniHash h, Htype_e htype, size_t index)
  510. {
  511. if ((htype == ht64) || (htype == ht32)) {
  512. ((uint64_t*)hTablePtr)[index] = h.h64;
  513. } else {
  514. assert(htype == ht128);
  515. ((XXH128_hash_t*)hTablePtr)[index] = h.h128;
  516. }
  517. }
  518. static int getNbBits_fromHtype(Htype_e htype) {
  519. switch(htype) {
  520. case ht32: return 32;
  521. case ht64: return 64;
  522. case ht128:return 128;
  523. default: EXIT("hash size not supported");
  524. }
  525. }
  526. static Htype_e getHtype_fromHbits(int nbBits) {
  527. switch(nbBits) {
  528. case 32 : return ht32;
  529. case 64 : return ht64;
  530. case 128: return ht128;
  531. default: EXIT("hash size not supported");
  532. }
  533. }
  534. static size_t search_collisions(
  535. searchCollisions_parameters param)
  536. {
  537. uint64_t totalH = param.nbH;
  538. const uint64_t hMask = param.mask;
  539. const uint64_t hSelector = param.maskSelector;
  540. int bflog = param.filterLog;
  541. const int filter = (param.filterLog >= 0);
  542. const size_t sampleSize = param.sampleSize;
  543. const int hashID = param.hashID;
  544. const Htype_e htype = getHtype_fromHbits(hashfnTable[hashID].bits);
  545. const int display = param.display;
  546. /* init */
  547. sampleFactory* const sf = create_sampleFactory(sampleSize, totalH, param.prngSeed);
  548. if (!sf) EXIT("not enough memory");
  549. //const char* const hname = hashfnTable[hashID].name;
  550. hashfn const hfunction = hashfnTable[hashID].fn;
  551. int const hwidth = hashfnTable[hashID].bits;
  552. if (totalH == 0) totalH = select_nbh(hwidth);
  553. if (bflog == 0) bflog = highestBitSet(totalH) + 1; /* auto-size filter */
  554. uint64_t const bfsize = (1ULL << bflog);
  555. /* === filter hashes (optional) === */
  556. Filter* bf = NULL;
  557. uint64_t nbPresents = totalH;
  558. if (filter) {
  559. time_t const filterTBegin = time(NULL);
  560. DISPLAY(" Creating filter (%i GB) \n", (int)(bfsize >> 30));
  561. bf = create_Filter(bflog);
  562. if (!bf) EXIT("not enough memory for filter");
  563. DISPLAY(" Generate %llu hashes from samples of %u bytes \n",
  564. (unsigned long long)totalH, (unsigned)sampleSize);
  565. nbPresents = 0;
  566. for (uint64_t n=0; n < totalH; n++) {
  567. if (display && ((n&0xFFFFF) == 1) )
  568. update_indicator(n, totalH);
  569. update_sampleFactory(sf);
  570. UniHash const h = hfunction(sf->buffer, sampleSize);
  571. if ((h.h64 & hMask) != hSelector) continue;
  572. nbPresents += (uint64_t)Filter_insert(bf, bflog, h.h64);
  573. }
  574. if (nbPresents==0) {
  575. DISPLAY(" Analysis completed: No collision detected \n");
  576. if (param.resultPtr) param.resultPtr->nbCollisions = 0;
  577. free_Filter(bf);
  578. free_sampleFactory(sf);
  579. return 0;
  580. }
  581. { double const filterDelay = difftime(time(NULL), filterTBegin);
  582. DISPLAY(" Generation and filter completed in %s, detected up to %llu candidates \n",
  583. displayDelay(filterDelay), (unsigned long long) nbPresents);
  584. } }
  585. /* === store hash candidates: duplicates will be present here === */
  586. time_t const storeTBegin = time(NULL);
  587. size_t const hashByteSize = (htype == ht128) ? 16 : 8;
  588. size_t const tableSize = (size_t)((nbPresents+1) * hashByteSize);
  589. assert(tableSize > nbPresents); /* check tableSize calculation overflow */
  590. DISPLAY(" Storing hash candidates (%i MB) \n", (int)(tableSize >> 20));
  591. /* Generate and store hashes */
  592. void* const hashCandidates = malloc(tableSize);
  593. if (!hashCandidates) EXIT("not enough memory to store candidates");
  594. init_sampleFactory(sf, totalH);
  595. size_t nbCandidates = 0;
  596. for (uint64_t n=0; n < totalH; n++) {
  597. if (display && ((n&0xFFFFF) == 1) ) update_indicator(n, totalH);
  598. update_sampleFactory(sf);
  599. UniHash const h = hfunction(sf->buffer, sampleSize);
  600. if ((h.h64 & hMask) != hSelector) continue;
  601. if (filter) {
  602. if (Filter_check(bf, bflog, h.h64)) {
  603. assert(nbCandidates < nbPresents);
  604. addHashCandidate(hashCandidates, h, htype, nbCandidates++);
  605. }
  606. } else {
  607. assert(nbCandidates < nbPresents);
  608. addHashCandidate(hashCandidates, h, htype, nbCandidates++);
  609. }
  610. }
  611. if (nbCandidates < nbPresents) {
  612. /* Try to mitigate gnuc_quicksort behavior, by reducing allocated memory,
  613. * since gnuc_quicksort uses a lot of additional memory for mergesort */
  614. void* const checkPtr = realloc(hashCandidates, nbCandidates * hashByteSize);
  615. assert(checkPtr != NULL);
  616. assert(checkPtr == hashCandidates); /* simplification: since we are reducing the size,
  617. * we hope to keep the same ptr position.
  618. * Otherwise, hashCandidates must be mutable. */
  619. DISPLAY(" List of hashes reduced to %u MB from %u MB (saved %u MB) \n",
  620. (unsigned)((nbCandidates * hashByteSize) >> 20),
  621. (unsigned)(tableSize >> 20),
  622. (unsigned)((tableSize - (nbCandidates * hashByteSize)) >> 20) );
  623. }
  624. double const storeTDelay = difftime(time(NULL), storeTBegin);
  625. DISPLAY(" Stored %llu hash candidates in %s \n",
  626. (unsigned long long) nbCandidates, displayDelay(storeTDelay));
  627. free_Filter(bf);
  628. free_sampleFactory(sf);
  629. /* === step 3: look for duplicates === */
  630. time_t const sortTBegin = time(NULL);
  631. DISPLAY(" Sorting candidates... ");
  632. fflush(NULL);
  633. if ((htype == ht64) || (htype == ht32)) {
  634. /*
  635. * Use C++'s std::sort, as it's faster than C stdlib's qsort, and
  636. * doesn't suffer from gnuc_libsort's memory expansion
  637. */
  638. sort64(hashCandidates, nbCandidates);
  639. } else {
  640. assert(htype == ht128);
  641. sort128(hashCandidates, nbCandidates); /* sort with custom comparator */
  642. }
  643. double const sortTDelay = difftime(time(NULL), sortTBegin);
  644. DISPLAY(" Completed in %s \n", displayDelay(sortTDelay));
  645. /* scan and count duplicates */
  646. time_t const countBegin = time(NULL);
  647. DISPLAY(" Looking for duplicates: ");
  648. fflush(NULL);
  649. size_t collisions = 0;
  650. for (size_t n=1; n<nbCandidates; n++) {
  651. if (isEqual(hashCandidates, n, n-1, htype)) {
  652. #if defined(COL_DISPLAY_DUPLICATES)
  653. printf("collision: ");
  654. printHash(hashCandidates, n, htype);
  655. printf(" / ");
  656. printHash(hashCandidates, n-1, htype);
  657. printf(" \n");
  658. #endif
  659. collisions++;
  660. } }
  661. if (!filter /* all candidates */ && display /*single thead*/ ) {
  662. /* check partial bitfields (high bits) */
  663. DISPLAY(" \n");
  664. int const hashBits = getNbBits_fromHtype(htype);
  665. double worstRatio = 0.;
  666. int worstNbHBits = 0;
  667. for (int nbHBits = 1; nbHBits < hashBits; nbHBits++) {
  668. uint64_t const nbSlots = (uint64_t)1 << nbHBits;
  669. double const expectedCollisions = estimateNbCollisions(nbCandidates, nbHBits);
  670. if ( (nbSlots > nbCandidates * 100) /* within range for meaningful collision analysis results */
  671. && (expectedCollisions > 18.0) ) {
  672. int const rShift = hashBits - nbHBits;
  673. size_t HBits_collisions = 0;
  674. for (size_t n=1; n<nbCandidates; n++) {
  675. if (isHighEqual(hashCandidates, n, n-1, htype, rShift)) {
  676. HBits_collisions++;
  677. } }
  678. double const collisionRatio = (double)HBits_collisions / expectedCollisions;
  679. if (collisionRatio > 2.0) DISPLAY("WARNING !!! ===> ");
  680. DISPLAY(" high %i bits: %zu collision (%.1f expected): x%.2f \n",
  681. nbHBits, HBits_collisions, expectedCollisions, collisionRatio);
  682. if (collisionRatio > worstRatio) {
  683. worstNbHBits = nbHBits;
  684. worstRatio = collisionRatio;
  685. } } }
  686. DISPLAY("Worst collision ratio at %i high bits: x%.2f \n",
  687. worstNbHBits, worstRatio);
  688. }
  689. double const countDelay = difftime(time(NULL), countBegin);
  690. DISPLAY(" Completed in %s \n", displayDelay(countDelay));
  691. /* clean and exit */
  692. free (hashCandidates);
  693. #if 0 /* debug */
  694. for (size_t n=0; n<nbCandidates; n++)
  695. printf("0x%016llx \n", (unsigned long long)hashCandidates[n]);
  696. #endif
  697. if (param.resultPtr) param.resultPtr->nbCollisions = collisions;
  698. return collisions;
  699. }
  700. #if defined(__MACH__) || defined(__linux__)
  701. #include <sys/resource.h>
  702. static size_t getProcessMemUsage(int children)
  703. {
  704. struct rusage stats;
  705. if (getrusage(children ? RUSAGE_CHILDREN : RUSAGE_SELF, &stats) == 0)
  706. return (size_t)stats.ru_maxrss;
  707. return 0;
  708. }
  709. #else
  710. static size_t getProcessMemUsage(int ignore) { (void)ignore; return 0; }
  711. #endif
  712. void time_collisions(searchCollisions_parameters param)
  713. {
  714. uint64_t totalH = param.nbH;
  715. int hashID = param.hashID;
  716. int display = param.display;
  717. /* init */
  718. assert(0 <= hashID && hashID < HASH_FN_TOTAL);
  719. //const char* const hname = hashfnTable[hashID].name;
  720. int const hwidth = hashfnTable[hashID].bits;
  721. if (totalH == 0) totalH = select_nbh(hwidth);
  722. double const targetColls = estimateNbCollisions(totalH, hwidth);
  723. /* Start the timer to measure start/end of hashing + collision detection. */
  724. time_t const programTBegin = time(NULL);
  725. /* Generate hashes, and count collisions */
  726. size_t const collisions = search_collisions(param);
  727. /* display results */
  728. double const programTDelay = difftime(time(NULL), programTBegin);
  729. size_t const programBytesSelf = getProcessMemUsage(0);
  730. size_t const programBytesChildren = getProcessMemUsage(1);
  731. DISPLAY("\n\n");
  732. DISPLAY("===> Found %llu collisions (x%.2f, %.1f expected) in %s\n",
  733. (unsigned long long)collisions,
  734. (double)collisions / targetColls,
  735. targetColls,
  736. displayDelay(programTDelay));
  737. if (programBytesSelf)
  738. DISPLAY("===> MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
  739. programBytesSelf>>20,
  740. programBytesChildren>>20);
  741. DISPLAY("------------------------------------------ \n");
  742. }
  743. // wrapper for pthread interface
  744. void MT_searchCollisions(void* payload)
  745. {
  746. search_collisions(*(searchCollisions_parameters*)payload);
  747. }
  748. /* === Command Line === */
  749. /*!
  750. * readU64FromChar():
  751. * Allows and interprets K, KB, KiB, M, MB and MiB suffix.
  752. * Will also modify `*stringPtr`, advancing it to the position where it stopped reading.
  753. */
  754. static uint64_t readU64FromChar(const char** stringPtr)
  755. {
  756. static uint64_t const max = (((uint64_t)(-1)) / 10) - 1;
  757. uint64_t result = 0;
  758. while ((**stringPtr >='0') && (**stringPtr <='9')) {
  759. assert(result < max);
  760. result *= 10;
  761. result += (unsigned)(**stringPtr - '0');
  762. (*stringPtr)++ ;
  763. }
  764. if ((**stringPtr=='K') || (**stringPtr=='M') || (**stringPtr=='G')) {
  765. uint64_t const maxK = ((uint64_t)(-1)) >> 10;
  766. assert(result < maxK);
  767. result <<= 10;
  768. if ((**stringPtr=='M') || (**stringPtr=='G')) {
  769. assert(result < maxK);
  770. result <<= 10;
  771. if (**stringPtr=='G') {
  772. assert(result < maxK);
  773. result <<= 10;
  774. }
  775. }
  776. (*stringPtr)++; /* skip `K` or `M` */
  777. if (**stringPtr=='i') (*stringPtr)++;
  778. if (**stringPtr=='B') (*stringPtr)++;
  779. }
  780. return result;
  781. }
  782. /**
  783. * longCommandWArg():
  784. * Checks if *stringPtr is the same as longCommand.
  785. * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
  786. * @return 0 and doesn't modify *stringPtr otherwise.
  787. */
  788. static int longCommandWArg(const char** stringPtr, const char* longCommand)
  789. {
  790. assert(longCommand); assert(stringPtr); assert(*stringPtr);
  791. size_t const comSize = strlen(longCommand);
  792. int const result = !strncmp(*stringPtr, longCommand, comSize);
  793. if (result) *stringPtr += comSize;
  794. return result;
  795. }
  796. #include "pool.h"
  797. /*
  798. * As some hashes use different algorithms depending on input size,
  799. * it can be necessary to test multiple input sizes
  800. * to paint an accurate picture of collision performance
  801. */
  802. #define SAMPLE_SIZE_DEFAULT 256
  803. #define HASHFN_ID_DEFAULT 0
  804. void help(const char* exeName)
  805. {
  806. printf("usage: %s [hashName] [opt] \n\n", exeName);
  807. printf("list of hashNames:");
  808. printf("%s ", hashfnTable[0].name);
  809. for (int i=1; i < HASH_FN_TOTAL; i++) {
  810. printf(", %s ", hashfnTable[i].name);
  811. }
  812. printf(" \n");
  813. printf("Default hashName is %s\n", hashfnTable[HASHFN_ID_DEFAULT].name);
  814. printf(" \n");
  815. printf("Optional parameters: \n");
  816. printf(" --nbh=NB Select nb of hashes to generate (%llu by default) \n", (unsigned long long)select_nbh(64));
  817. printf(" --filter Activates the filter. Slower, but reduces memory usage for the same nb of hashes.\n");
  818. printf(" --threadlog=NB Use 2^NB threads.\n");
  819. printf(" --len=MB Set length of the input (%i bytes by default) \n", SAMPLE_SIZE_DEFAULT);
  820. }
  821. int bad_argument(const char* exeName)
  822. {
  823. printf("incorrect command: \n");
  824. help(exeName);
  825. return 1;
  826. }
  827. int main(int argc, const char** argv)
  828. {
  829. if (sizeof(size_t) < 8) return 1; // cannot work on systems without ability to allocate objects >= 4 GB
  830. assert(argc > 0);
  831. const char* const exeName = argv[0];
  832. uint64_t totalH = 0; /* auto, based on nbBits */
  833. int bflog = 0; /* auto */
  834. int filter = 0; /* disabled */
  835. size_t sampleSize = SAMPLE_SIZE_DEFAULT;
  836. int hashID = HASHFN_ID_DEFAULT;
  837. int threadlog = 0;
  838. uint64_t prngSeed = 0;
  839. int arg_nb;
  840. for (arg_nb = 1; arg_nb < argc; arg_nb++) {
  841. const char** arg = argv + arg_nb;
  842. if (!strcmp(*arg, "-h")) { help(exeName); return 0; }
  843. if (longCommandWArg(arg, "-T")) { threadlog = (int)readU64FromChar(arg); continue; }
  844. if (!strcmp(*arg, "--filter")) { filter=1; continue; }
  845. if (!strcmp(*arg, "--no-filter")) { filter=0; continue; }
  846. if (longCommandWArg(arg, "--seed")) { prngSeed = readU64FromChar(arg); continue; }
  847. if (longCommandWArg(arg, "--nbh=")) { totalH = readU64FromChar(arg); continue; }
  848. if (longCommandWArg(arg, "--filter=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
  849. if (longCommandWArg(arg, "--filterlog=")) { filter=1; bflog = (int)readU64FromChar(arg); assert(bflog < 64); continue; }
  850. if (longCommandWArg(arg, "--size=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
  851. if (longCommandWArg(arg, "--len=")) { sampleSize = (size_t)readU64FromChar(arg); continue; }
  852. if (longCommandWArg(arg, "--threadlog=")) { threadlog = (int)readU64FromChar(arg); continue; }
  853. /* argument understood as hash name (must be correct) */
  854. int hnb;
  855. for (hnb=0; hnb < HASH_FN_TOTAL; hnb++) {
  856. if (!strcmp(*arg, hashfnTable[hnb].name)) { hashID = hnb; break; }
  857. }
  858. if (hnb == HASH_FN_TOTAL) return bad_argument(exeName);
  859. }
  860. /* init */
  861. const char* const hname = hashfnTable[hashID].name;
  862. int const hwidth = hashfnTable[hashID].bits;
  863. if (totalH == 0) totalH = select_nbh(hwidth);
  864. double const targetColls = estimateNbCollisions(totalH, hwidth);
  865. if (bflog == 0) bflog = highestBitSet(totalH) + 1; /* auto-size filter */
  866. if (!filter) bflog = -1; // disable filter
  867. if (sizeof(size_t) < 8)
  868. EXIT("This program has not been validated on architectures other than "
  869. "64bit \n");
  870. printf(" *** Collision tester for 64+ bit hashes *** \n\n");
  871. printf("Testing %s algorithm (%i-bit) \n", hname, hwidth);
  872. printf("This program will allocate a lot of memory,\n");
  873. printf("generate %llu %i-bit hashes from samples of %u bytes, \n",
  874. (unsigned long long)totalH, hwidth, (unsigned)sampleSize);
  875. printf("and attempt to produce %.0f collisions. \n\n", targetColls);
  876. int const nbThreads = 1 << threadlog;
  877. if (nbThreads <= 0) EXIT("Invalid --threadlog value.");
  878. if (nbThreads == 1) {
  879. searchCollisions_parameters params;
  880. params.nbH = totalH;
  881. params.mask = 0;
  882. params.maskSelector = 0;
  883. params.sampleSize = sampleSize;
  884. params.filterLog = bflog;
  885. params.hashID = hashID;
  886. params.display = 1;
  887. params.resultPtr = NULL;
  888. params.prngSeed = prngSeed;
  889. params.nbThreads = 1;
  890. time_collisions(params);
  891. } else { /* nbThreads > 1 */
  892. /* use multithreading */
  893. if (threadlog >= 30) EXIT("too many threads requested");
  894. if ((uint64_t)nbThreads > (totalH >> 16))
  895. EXIT("too many threads requested");
  896. if (bflog > 0 && threadlog > (bflog-10))
  897. EXIT("too many threads requested");
  898. printf("using %i threads ... \n", nbThreads);
  899. /* allocation */
  900. time_t const programTBegin = time(NULL);
  901. POOL_ctx* const pt = POOL_create((size_t)nbThreads, 1);
  902. if (!pt) EXIT("not enough memory for threads");
  903. searchCollisions_results* const MTresults = calloc (sizeof(searchCollisions_results), (size_t)nbThreads);
  904. if (!MTresults) EXIT("not enough memory");
  905. searchCollisions_parameters* const MTparams = calloc (sizeof(searchCollisions_parameters), (size_t)nbThreads);
  906. if (!MTparams) EXIT("not enough memory");
  907. /* distribute jobs */
  908. for (int tnb=0; tnb<nbThreads; tnb++) {
  909. MTparams[tnb].nbH = totalH;
  910. MTparams[tnb].mask = (uint64_t)nbThreads - 1;
  911. MTparams[tnb].sampleSize = sampleSize;
  912. MTparams[tnb].filterLog = bflog ? bflog - threadlog : 0;
  913. MTparams[tnb].hashID = hashID;
  914. MTparams[tnb].display = 0;
  915. MTparams[tnb].resultPtr = MTresults+tnb;
  916. MTparams[tnb].prngSeed = prngSeed;
  917. MTparams[tnb].maskSelector = (uint64_t)tnb;
  918. POOL_add(pt, MT_searchCollisions, MTparams + tnb);
  919. }
  920. POOL_free(pt); /* actually joins and free */
  921. /* Gather results */
  922. uint64_t nbCollisions=0;
  923. for (int tnb=0; tnb<nbThreads; tnb++) {
  924. nbCollisions += MTresults[tnb].nbCollisions;
  925. }
  926. double const programTDelay = difftime(time(NULL), programTBegin);
  927. size_t const programBytesSelf = getProcessMemUsage(0);
  928. size_t const programBytesChildren = getProcessMemUsage(1);
  929. printf("\n\n");
  930. printf("===> Found %llu collisions (x%.2f, %.1f expected) in %s\n",
  931. (unsigned long long)nbCollisions,
  932. (double)nbCollisions / targetColls,
  933. targetColls,
  934. displayDelay(programTDelay));
  935. if (programBytesSelf)
  936. printf("===> MaxRSS(self) %zuMB, MaxRSS(children) %zuMB\n",
  937. programBytesSelf>>20,
  938. programBytesChildren>>20);
  939. printf("------------------------------------------ \n");
  940. /* Clean up */
  941. free(MTparams);
  942. free(MTresults);
  943. }
  944. return 0;
  945. }