/*-------------------------------------------------------------------------
 *
 * hashjoin.h
 *	  internal structures for hash joins
 *
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/executor/hashjoin.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef HASHJOIN_H
#define HASHJOIN_H

#include "nodes/execnodes.h"
#include "storage/buffile.h"

/* ----------------------------------------------------------------
 *		hash-join hash table structures
 *
 * Each active hashjoin has a HashJoinTable control block, which is
 * palloc'd in the executor's per-query context.  All other storage needed
 * for the hashjoin is kept in private memory contexts, two for each hashjoin.
 * This makes it easy and fast to release the storage when we don't need it
 * anymore.  (Exception: data associated with the temp files lives in the
 * per-query context too, since we always call buffile.c in that context.)
 *
 * The hashtable contexts are made children of the per-query context, ensuring
 * that they will be discarded at end of statement even if the join is
 * aborted early by an error.  (Likewise, any temporary files we make will
 * be cleaned up by the virtual file manager in event of an error.)
 *
 * Storage that should live through the entire join is allocated from the
 * "hashCxt", while storage that is only wanted for the current batch is
 * allocated in the "batchCxt".  By resetting the batchCxt at the end of
 * each batch, we free all the per-batch storage reliably and without tedium.
 *
 * During the first scan of the inner relation, we get its tuples from the
 * executor.  If nbatch > 1 then tuples that don't belong in the first batch
 * get saved into inner-batch temp files.  The same applies to the first scan
 * of the outer relation, except that we write tuples to outer-batch temp
 * files.  After finishing the first scan, we do the following for each
 * remaining batch:
 *	1. Read tuples from the inner batch file, load them into hash buckets.
 *	2. Read tuples from the outer batch file, match them to hash buckets,
 *	   and output the joined rows.
 *
 * It is possible to increase nbatch on the fly if the in-memory hash table
 * gets too big.  The hash-value-to-batch computation is arranged so that this
 * can only cause a tuple to go into a later batch than previously thought,
 * never into an earlier batch.  When we increase nbatch, we rescan the hash
 * table and dump out any tuples that are now of a later batch to the correct
 * inner batch file.  Subsequently, while reading either inner or outer batch
 * files, we might find tuples that no longer belong to the current batch;
 * if so, we just dump them out to the correct batch file.
 * ----------------------------------------------------------------
 */
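/*
 * Illustrative sketch (not part of the original header): one way to arrange
 * the hash-value-to-bucket/batch mapping described above, in the spirit of
 * ExecHashGetBucketAndBatch() in nodeHash.c.  Because nbuckets and nbatch are
 * both powers of 2, the bucket number can be taken from the low-order bits of
 * the hash value and the batch number from the next-higher bits.  Doubling
 * nbatch then just exposes one more, higher-order bit to the batch number,
 * which is why a tuple can only move to a later batch, never an earlier one.
 * The function name below is hypothetical.
 */
static inline void
sketch_get_bucket_and_batch(uint32 hashvalue, int nbuckets, int log2_nbuckets,
                            int nbatch, int *bucketno, int *batchno)
{
    if (nbatch > 1)
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = (hashvalue >> log2_nbuckets) & (nbatch - 1);
    }
    else
    {
        *bucketno = hashvalue & (nbuckets - 1);
        *batchno = 0;
    }
}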
/* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */

typedef struct HashJoinTupleData
{
    struct HashJoinTupleData *next; /* link to next tuple in same bucket */
    uint32      hashvalue;          /* tuple's hash code */
    /* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
} HashJoinTupleData;

#define HJTUPLE_OVERHEAD        MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup) \
    ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
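/*
 * Illustrative sketch (hypothetical helpers, not part of the original
 * header): how the two macros above are typically used.  A stored tuple is
 * HJTUPLE_OVERHEAD bytes of header followed immediately by the MinimalTuple
 * body, so the space needed is HJTUPLE_OVERHEAD + mtuple->t_len, and
 * HJTUPLE_MINTUPLE() recovers the tuple body from a HashJoinTuple pointer.
 * (Dereferencing t_len assumes MinimalTupleData from access/htup_details.h
 * is visible.)
 */
static inline Size
sketch_hashjoin_tuple_size(MinimalTuple mtuple)
{
    return HJTUPLE_OVERHEAD + mtuple->t_len;
}

static inline void
sketch_fill_hashjoin_tuple(HashJoinTuple hashTuple, uint32 hashvalue,
                           MinimalTuple mtuple)
{
    /* caller has allocated sketch_hashjoin_tuple_size(mtuple) bytes */
    hashTuple->hashvalue = hashvalue;
    memcpy(HJTUPLE_MINTUPLE(hashTuple), mtuple, mtuple->t_len);
    /* caller then links hashTuple->next into the appropriate bucket list */
}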
/*
 * If the outer relation's distribution is sufficiently nonuniform, we attempt
 * to optimize the join by treating the hash values corresponding to the outer
 * relation's MCVs specially.  Inner relation tuples matching these hash
 * values go into the "skew" hashtable instead of the main hashtable, and
 * outer relation tuples with these hash values are matched against that
 * table instead of the main one.  Thus, tuples with these hash values are
 * effectively handled as part of the first batch and will never go to disk.
 * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
 * allowed for the join; while building the hashtables, we decrease the number
 * of MCVs being specially treated if needed to stay under this limit.
 *
 * Note: you might wonder why we look at the outer relation stats for this,
 * rather than the inner.  One reason is that the outer relation is typically
 * bigger, so we get more I/O savings by optimizing for its most common values.
 * Also, for similarly-sized relations, the planner prefers to put the more
 * uniformly distributed relation on the inside, so we're more likely to find
 * interesting skew in the outer relation.
 */
typedef struct HashSkewBucket
{
    uint32      hashvalue;      /* common hash value */
    HashJoinTuple tuples;       /* linked list of inner-relation tuples */
} HashSkewBucket;

#define SKEW_BUCKET_OVERHEAD        MAXALIGN(sizeof(HashSkewBucket))
#define INVALID_SKEW_BUCKET_NO      (-1)
#define SKEW_WORK_MEM_PERCENT       2
#define SKEW_MIN_OUTER_FRACTION     0.01
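/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * one plausible lookup scheme for the skew hashtable, consistent with the
 * note below that skewBucketLen is a power of 2.  The array acts as an open
 * hash table with linear probing; unused slots are NULL.  The real logic
 * lives in nodeHash.c.
 */
static inline int
sketch_skew_bucket_lookup(HashSkewBucket **skewBucket, int skewBucketLen,
                          uint32 hashvalue)
{
    int         bucket = hashvalue & (skewBucketLen - 1);

    /* probe until we hit the matching bucket or an empty slot */
    while (skewBucket[bucket] != NULL &&
           skewBucket[bucket]->hashvalue != hashvalue)
        bucket = (bucket + 1) & (skewBucketLen - 1);

    return (skewBucket[bucket] != NULL) ? bucket : INVALID_SKEW_BUCKET_NO;
}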
/*
 * To reduce palloc overhead, the HashJoinTuples for the current batch are
 * packed in 32kB buffers instead of pallocing each tuple individually.
 */
typedef struct HashMemoryChunkData
{
    int         ntuples;        /* number of tuples stored in this chunk */
    size_t      maxlen;         /* size of the buffer holding the tuples */
    size_t      used;           /* number of buffer bytes already used */

    struct HashMemoryChunkData *next;   /* pointer to the next chunk (linked
                                         * list) */

    char        data[FLEXIBLE_ARRAY_MEMBER];    /* buffer allocated at the end */
} HashMemoryChunkData;

typedef struct HashMemoryChunkData *HashMemoryChunk;

#define HASH_CHUNK_SIZE         (32 * 1024L)
#define HASH_CHUNK_THRESHOLD    (HASH_CHUNK_SIZE / 4)
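/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * the dense-allocation idea behind the chunk list above.  Small requests are
 * carved out of the current 32kB chunk; requests larger than
 * HASH_CHUNK_THRESHOLD get a dedicated chunk of their own.  The real
 * implementation (dense_alloc() in nodeHash.c) allocates in the hash table's
 * batchCxt and is more careful about where oversized chunks are linked into
 * the list; this version is simplified.
 */
static inline void *
sketch_dense_alloc(MemoryContext cxt, HashMemoryChunk *chunk_list, Size size)
{
    HashMemoryChunk chunk;
    char       *ptr;

    size = MAXALIGN(size);

    if (size > HASH_CHUNK_THRESHOLD)
    {
        /* oversized request: give it a dedicated, exactly-sized chunk */
        chunk = (HashMemoryChunk)
            MemoryContextAlloc(cxt, offsetof(HashMemoryChunkData, data) + size);
        chunk->maxlen = size;
        chunk->used = size;
        chunk->ntuples = 1;
        chunk->next = *chunk_list;
        *chunk_list = chunk;
        return chunk->data;
    }

    if (*chunk_list == NULL ||
        (*chunk_list)->maxlen - (*chunk_list)->used < size)
    {
        /* no room in the current chunk: start a fresh 32kB chunk */
        chunk = (HashMemoryChunk)
            MemoryContextAlloc(cxt, offsetof(HashMemoryChunkData, data) +
                               HASH_CHUNK_SIZE);
        chunk->maxlen = HASH_CHUNK_SIZE;
        chunk->used = 0;
        chunk->ntuples = 0;
        chunk->next = *chunk_list;
        *chunk_list = chunk;
    }

    /* hand out the next 'size' bytes of the current chunk */
    ptr = (*chunk_list)->data + (*chunk_list)->used;
    (*chunk_list)->used += size;
    (*chunk_list)->ntuples += 1;

    return ptr;
}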
typedef struct HashJoinTableData
{
    int         nbuckets;       /* # buckets in the in-memory hash table */
    int         log2_nbuckets;  /* its log2 (nbuckets must be a power of 2) */

    int         nbuckets_original;      /* # buckets when starting the first
                                         * hash */
    int         nbuckets_optimal;       /* optimal # buckets (per batch) */
    int         log2_nbuckets_optimal;  /* log2(nbuckets_optimal) */

    /* buckets[i] is head of list of tuples in i'th in-memory bucket */
    struct HashJoinTupleData **buckets;
    /* buckets array is per-batch storage, as are all the tuples */

    bool        keepNulls;      /* true to store unmatchable NULL tuples */

    bool        skewEnabled;    /* are we using skew optimization? */
    HashSkewBucket **skewBucket;    /* hashtable of skew buckets */
    int         skewBucketLen;  /* size of skewBucket array (a power of 2!) */
    int         nSkewBuckets;   /* number of active skew buckets */
    int        *skewBucketNums; /* array indexes of active skew buckets */

    int         nbatch;         /* number of batches */
    int         curbatch;       /* current batch #; 0 during 1st pass */

    int         nbatch_original;    /* nbatch when we started inner scan */
    int         nbatch_outstart;    /* nbatch when we started outer scan */

    bool        growEnabled;    /* flag to shut off nbatch increases */

    double      totalTuples;    /* # tuples obtained from inner plan */
    double      skewTuples;     /* # tuples inserted into skew buckets */
    /*
     * These arrays are allocated for the life of the hash join, but only if
     * nbatch > 1.  A file is opened only when we first write a tuple into it
     * (otherwise its pointer remains NULL); see the sketch after this struct.
     * Note that the zero'th array elements never get used, since we will
     * process rather than dump out any tuples of batch zero.
     */
    BufFile   **innerBatchFile; /* buffered virtual temp file per batch */
    BufFile   **outerBatchFile; /* buffered virtual temp file per batch */

    /*
     * Info about the datatype-specific hash functions for the datatypes being
     * hashed.  These are arrays of the same length as the number of hash join
     * clauses (hash keys).
     */
    FmgrInfo   *outer_hashfunctions;    /* lookup data for hash functions */
    FmgrInfo   *inner_hashfunctions;    /* lookup data for hash functions */
    bool       *hashStrict;     /* is each hash join operator strict? */

    Size        spaceUsed;      /* memory space currently used by tuples */
    Size        spaceAllowed;   /* upper limit for space used */
    Size        spacePeak;      /* peak space used */
    Size        spaceUsedSkew;  /* skew hash table's current space usage */
    Size        spaceAllowedSkew;   /* upper limit for skew hashtable */

    MemoryContext hashCxt;      /* context for whole-hash-join storage */
    MemoryContext batchCxt;     /* context for this-batch-only storage */

    /* used for dense allocation of tuples (into linked chunks) */
    HashMemoryChunk chunks;     /* one list for the whole batch */
} HashJoinTableData;
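/*
 * Illustrative sketch (hypothetical helper, not part of the original header):
 * spilling a tuple to one of the batch files above, in the spirit of
 * ExecHashJoinSaveTuple() in nodeHashjoin.c.  The temp file is created
 * lazily, on the first write, which is why innerBatchFile/outerBatchFile
 * entries may stay NULL.
 */
static inline void
sketch_save_tuple_to_batch(MinimalTuple tuple, uint32 hashvalue,
                           BufFile **fileptr)
{
    BufFile    *file = *fileptr;

    if (file == NULL)
    {
        /* first write to this batch: create the temp file now */
        file = BufFileCreateTemp(false);
        *fileptr = file;
    }

    /* store the hash value first, then the MinimalTuple body */
    if (BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)) != sizeof(uint32) ||
        BufFileWrite(file, (void *) tuple, tuple->t_len) != tuple->t_len)
        elog(ERROR, "could not write to hash-join temporary file");
}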
#endif   /* HASHJOIN_H */