
/*-------------------------------------------------------------------------
 *
 * nbtree.h
 *	  header file for postgres btree access method implementation.
 *
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtree.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTREE_H
#define NBTREE_H

#include "access/amapi.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlogreader.h"
#include "catalog/pg_index.h"
#include "lib/stringinfo.h"
#include "storage/bufmgr.h"

/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;

/*
 *	BTPageOpaqueData -- At the end of every page, we store a pointer
 *	to both siblings in the tree.  This is used to do forward/backward
 *	index scans.  The next-page link is also critical for recovery when
 *	a search has navigated to the wrong page due to concurrent page splits
 *	or deletions; see src/backend/access/nbtree/README for more info.
 *
 *	In addition, we store the page's btree level (counting upwards from
 *	zero at a leaf page) as well as some flag bits indicating the page type
 *	and status.  If the page is deleted, we replace the level with the
 *	next-transaction-ID value indicating when it is safe to reclaim the page.
 *
 *	We also store a "vacuum cycle ID".  When a page is split while VACUUM is
 *	processing the index, a nonzero value associated with the VACUUM run is
 *	stored into both halves of the split page.  (If VACUUM is not running,
 *	both pages receive zero cycleids.)  This allows VACUUM to detect whether
 *	a page was split since it started, with a small probability of false match
 *	if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
 *	ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
 *	(original) page, and set in the right page, but only if the next page
 *	to its right has a different cycleid.
 *
 *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
 *	instead.
 */

typedef struct BTPageOpaqueData
{
	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
	union
	{
		uint32		level;		/* tree level --- zero for leaf pages */
		TransactionId xact;		/* next transaction ID, if deleted */
	}			btpo;
	uint16		btpo_flags;		/* flag bits, see below */
	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
} BTPageOpaqueData;

typedef BTPageOpaqueData *BTPageOpaque;
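
/*
 * Illustrative sketch (not part of the original header): callers reach the
 * opaque data through the page's special space, using PageGetSpecialPointer
 * from storage/bufpage.h:
 *
 *		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 *
 *		if (P_ISLEAF(opaque))	(see the page-state macros below)
 *			elog(DEBUG1, "leaf page, cycle id %u", opaque->btpo_cycleid);
 *		else
 *			elog(DEBUG1, "internal page at level %u", opaque->btpo.level);
 */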
/* Bits defined in btpo_flags */
#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
#define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
#define BTP_META		(1 << 3)	/* meta-page */
#define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
#define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
#define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */

/*
 * The max allowed value of a cycle ID is a bit less than 64K.  This is
 * for convenience of pg_filedump and similar utilities: we want to use
 * the last 2 bytes of special space as an index type indicator, and
 * restricting cycle ID lets btree use that space for vacuum cycle IDs
 * while still allowing index type to be identified.
 */
#define MAX_BT_CYCLE_ID		0xFF7F
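
/*
 * (Worked out: MAX_BT_CYCLE_ID is 0xFF7F = 65407, so the last 2 bytes of a
 * btree page's special space never exceed it, while other access methods
 * put identifying values above that range there, e.g. hash's HASHO_PAGE_ID
 * 0xFF80.)
 */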
/*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
 * We also point to the "fast" root, which is the current effective root;
 * see README for discussion.
 */

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* should contain BTREE_VERSION */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
} BTMetaPageData;

#define BTPageGetMeta(p) \
	((BTMetaPageData *) PageGetContents(p))

#define BTREE_METAPAGE	0		/* first page is meta */
#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
#define BTREE_VERSION	2		/* current version number */

/*
 * Maximum size of a btree index entry, including its tuple header.
 *
 * We actually need to be able to fit three items on every page,
 * so restrict any one item to 1/3 the per-page available space.
 */
#define BTMaxItemSize(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
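
/*
 * Worked example (assuming the default BLCKSZ of 8192 and MAXALIGN of 8):
 * MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) = MAXALIGN(24 + 12)
 * = 40, and MAXALIGN(sizeof(BTPageOpaqueData)) = 16, so
 * BTMaxItemSize = MAXALIGN_DOWN((8192 - 40 - 16) / 3) = 2712 bytes.
 */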
/*
 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 * For pages above the leaf level, we use a fixed 70% fillfactor.
 * The fillfactor is applied during index build and when splitting
 * a rightmost page; when splitting non-rightmost pages we try to
 * divide the data equally.
 */
#define BTREE_MIN_FILLFACTOR		10
#define BTREE_DEFAULT_FILLFACTOR	90
#define BTREE_NONLEAF_FILLFACTOR	70

/*
 *	Test whether two btree entries are "the same".
 *
 *	Old comments:
 *	In addition, we must guarantee that all tuples in the index are unique,
 *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
 *	do this is by generating a new OID for every insertion that we do in the
 *	tree.  This adds eight bytes to the size of btree index tuples.  Note
 *	that we do not use the OID as part of a composite key; the OID only
 *	serves as a unique identifier for a given index tuple (logical position
 *	within a page).
 *
 *	New comments:
 *	actually, we must guarantee that all tuples in A LEVEL
 *	are unique, not in ALL INDEX.  So, we can use the t_tid
 *	as unique identifier for a given index tuple (logical position
 *	within a level). - vadim 04/09/97
 */
#define BTTidSame(i1, i2) \
	( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
	  (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
	  (i1).ip_posid == (i2).ip_posid )
#define BTEntrySame(i1, i2) \
	BTTidSame((i1)->t_tid, (i2)->t_tid)

/*
 *	In general, the btree code tries to localize its knowledge about
 *	page layout to a couple of routines.  However, we need a special
 *	value to indicate "no page number" in those places where we expect
 *	page numbers.  We can use zero for this because we never need to
 *	make a pointer to the metadata page.
 */
#define P_NONE			0

/*
 * Macros to test whether a page is leftmost or rightmost on its tree level,
 * as well as other state info kept in the opaque data.
 */
#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)
#define P_INCOMPLETE_SPLIT(opaque)	((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)

/*
 *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 *	page.  The high key is not a data key, but gives info about what range of
 *	keys is supposed to be on this page.  The high key on a page is required
 *	to be greater than or equal to any data key that appears on the page.
 *	If we find ourselves trying to insert a key > high key, we know we need
 *	to move right (this should only happen if the page was split since we
 *	examined the parent page).
 *
 *	Our insertion algorithm guarantees that we can use the initial least key
 *	on our right sibling as the high key.  Once a page is created, its high
 *	key changes only if the page is split.
 *
 *	On a non-rightmost page, the high key lives in item 1 and data items
 *	start in item 2.  Rightmost pages have no high key, so we store data
 *	items beginning in item 1.
 */

#define P_HIKEY				((OffsetNumber) 1)
#define P_FIRSTKEY			((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
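
/*
 * Illustrative sketch (not from the original header): a loop over a page's
 * data items must start at P_FIRSTDATAKEY so that the high key is skipped
 * on non-rightmost pages:
 *
 *		OffsetNumber offnum;
 *		OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
 *
 *		for (offnum = P_FIRSTDATAKEY(opaque);
 *			 offnum <= maxoff;
 *			 offnum = OffsetNumberNext(offnum))
 *		{
 *			IndexTuple	itup;
 *
 *			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
 *			... process itup ...
 *		}
 */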
/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record's xl_info field.
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0	/* mark a leaf as half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */
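
/*
 * Illustrative sketch: a redo or desc routine recovers one of the above
 * codes by masking off the low (generic) bits of xl_info, as btree_redo
 * does:
 *
 *		uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 *
 *		switch (info)
 *		{
 *			case XLOG_BTREE_INSERT_LEAF:
 *				...
 *		}
 */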
/*
 * All that we need to regenerate the meta-data page
 */
typedef struct xl_btree_metadata
{
	BlockNumber root;
	uint32		level;
	BlockNumber fastroot;
	uint32		fastlevel;
} xl_btree_metadata;

/*
 * This is what we need to know about simple (without split) insert.
 *
 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
 * Note that INSERT_META implies it's not a leaf page.
 *
 * Backup Blk 0: original page (data contains the inserted tuple)
 * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META
 * Backup Blk 2: xl_btree_metadata, if INSERT_META
 */
typedef struct xl_btree_insert
{
	OffsetNumber offnum;
} xl_btree_insert;

#define SizeOfBtreeInsert	(offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))

/*
 * On insert with split, we save all the items going into the right sibling
 * so that we can restore it completely from the log record.  This takes
 * less xlog space than the normal approach, because if we logged the right
 * page in the usual incremental way, XLogInsert would almost always think
 * the right page is new and store its whole page image.  The left page,
 * however, is handled in the normal incremental-update fashion.
 *
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether newitemoff and the new item
 * are stored or not).  The _ROOT variants indicate that we are splitting
 * the root page, and thus that a newroot record rather than an insert or
 * split record should follow.  Note that a split record never carries a
 * metapage update --- we'll do that in the parent-level update.
 *
 * Backup Blk 0: original page / new left page
 *
 * The left page's data portion contains the new item, if it's the _L variant.
 * (In the _R variants, the new item is one of the right page's tuples.)
 * If level > 0, an IndexTuple representing the HIKEY of the left page
 * follows.  We don't need this on leaf pages, because it's the same as the
 * leftmost key in the new right page.
 *
 * Backup Blk 1: new right page
 *
 * The right page's data portion contains the right page's tuples in the
 * form used by _bt_restore_page.
 *
 * Backup Blk 2: next block (orig page's rightlink), if any
 * Backup Blk 3: child's left sibling, if non-leaf split
 */
typedef struct xl_btree_split
{
	uint32		level;			/* tree level of page being split */
	OffsetNumber firstright;	/* first item moved to right page */
	OffsetNumber newitemoff;	/* new item's offset (if placed on left page) */
} xl_btree_split;

#define SizeOfBtreeSplit	(offsetof(xl_btree_split, newitemoff) + sizeof(OffsetNumber))
/*
 * This is what we need to know about delete of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when *not* executed by VACUUM.
 *
 * Backup Blk 0: index page
 */
typedef struct xl_btree_delete
{
	RelFileNode hnode;			/* RelFileNode of the heap the index currently
								 * points at */
	int			nitems;

	/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;

#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nitems) + sizeof(int))

/*
 * This is what we need to know about page reuse within btree.
 */
typedef struct xl_btree_reuse_page
{
	RelFileNode node;
	BlockNumber block;
	TransactionId latestRemovedXid;
} xl_btree_reuse_page;

#define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))

/*
 * This is what we need to know about vacuum of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when executed by VACUUM.
 *
 * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
 * For non-MVCC index scans there is an additional correctness requirement
 * for applying these changes during recovery, which is that we must do one
 * of these two things for every block in the index:
 *		* lock the block for cleanup and apply any required changes
 *		* EnsureBlockUnpinned()
 * The purpose of this is to ensure that no index scans started before we
 * finish scanning the index are still running by the time we begin to remove
 * heap tuples.
 *
 * Any changes to any one block are registered on just one WAL record.  All
 * blocks that we need to run EnsureBlockUnpinned() on are listed as a block
 * range starting from the last block vacuumed through until this one;
 * individual block numbers aren't given.
 *
 * Note that the *last* WAL record in any vacuum of an index is allowed to
 * have a zero length array of offsets.  Earlier records must have at least one.
 */
typedef struct xl_btree_vacuum
{
	BlockNumber lastBlockVacuumed;

	/* TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_vacuum;

#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
/*
 * This is what we need to know about marking an empty branch for deletion.
 * The target identifies the tuple removed from the parent page (note that we
 * remove this tuple's downlink and the *following* tuple's key).  Note that
 * the leaf page is empty, so we don't need to store its content --- it is
 * just reinitialized during recovery using the rest of the fields.
 *
 * Backup Blk 0: leaf block
 * Backup Blk 1: top parent
 */
typedef struct xl_btree_mark_page_halfdead
{
	OffsetNumber poffset;		/* deleted tuple id in parent page */

	/* information needed to recreate the leaf page: */
	BlockNumber leafblk;		/* leaf block ultimately being deleted */
	BlockNumber leftblk;		/* leaf block's left sibling, if any */
	BlockNumber rightblk;		/* leaf block's right sibling */
	BlockNumber topparent;		/* topmost internal page in the branch */
} xl_btree_mark_page_halfdead;

#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))

/*
 * This is what we need to know about deletion of a btree page.  Note we do
 * not store any content for the deleted page --- it is just rewritten as empty
 * during recovery, apart from resetting the btpo.xact.
 *
 * Backup Blk 0: target block being deleted
 * Backup Blk 1: target block's left sibling, if any
 * Backup Blk 2: target block's right sibling
 * Backup Blk 3: leaf block (if different from target)
 * Backup Blk 4: metapage (if rightsib becomes new fast root)
 */
typedef struct xl_btree_unlink_page
{
	BlockNumber leftsib;		/* target block's left sibling, if any */
	BlockNumber rightsib;		/* target block's right sibling */

	/*
	 * Information needed to recreate the leaf page, when target is an
	 * internal page.
	 */
	BlockNumber leafleftsib;
	BlockNumber leafrightsib;
	BlockNumber topparent;		/* next child down in the branch */

	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;

#define SizeOfBtreeUnlinkPage	(offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))

/*
 * New root log record.  There are zero tuples if this is to establish an
 * empty root, or two if it is the result of splitting an old root.
 *
 * Note that although this implies rewriting the metadata page, we don't need
 * an xl_btree_metadata record --- the rootblk and level are sufficient.
 *
 * Backup Blk 0: new root page (2 tuples as payload, if splitting old root)
 * Backup Blk 1: left child (if splitting an old root)
 * Backup Blk 2: metapage
 */
typedef struct xl_btree_newroot
{
	BlockNumber rootblk;		/* location of new root (redundant with blk 0) */
	uint32		level;			/* its tree level */
} xl_btree_newroot;

#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))

/*
 *	Operator strategy numbers for B-tree have been moved to access/stratnum.h,
 *	because many places need to use them in ScanKeyInit() calls.
 *
 *	The strategy numbers are chosen so that we can commute them by
 *	subtraction, thus:
 */
#define BTCommuteStrategyNumber(strat)	(BTMaxStrategyNumber + 1 - (strat))
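
/*
 * For example, since BTMaxStrategyNumber is 5 (see stratnum.h):
 *
 *		BTCommuteStrategyNumber(BTLessStrategyNumber) == BTGreaterStrategyNumber
 *		BTCommuteStrategyNumber(BTLessEqualStrategyNumber) == BTGreaterEqualStrategyNumber
 *		BTCommuteStrategyNumber(BTEqualStrategyNumber) == BTEqualStrategyNumber
 */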
/*
 *	When a new operator class is declared, we require that the user
 *	supply us with an amproc procedure (BTORDER_PROC) for determining
 *	whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 *	must return < 0, 0, > 0, respectively, in these three cases.  (It must
 *	not return INT_MIN, since we may negate the result before using it.)
 *
 *	To facilitate accelerated sorting, an operator class may choose to
 *	offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 *	src/include/utils/sortsupport.h.
 */

#define BTORDER_PROC		1
#define BTSORTSUPPORT_PROC	2
#define BTNProcs			2
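
/*
 * Illustrative sketch of the BTORDER_PROC contract, using a hypothetical
 * comparator for plain int32 keys (real support functions are SQL-callable
 * and go through the fmgr interface):
 *
 *		static int
 *		example_int32_cmp(int32 a, int32 b)
 *		{
 *			if (a < b)
 *				return -1;
 *			if (a > b)
 *				return 1;
 *			return 0;
 *		}
 *
 * Note that it can never return INT_MIN, per the rule above.
 */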
/*
 *	We need to be able to tell the difference between read and write
 *	requests for pages, in order to do locking correctly.
 */

#define BT_READ			BUFFER_LOCK_SHARE
#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE

/*
 *	BTStackData -- As we descend a tree, we push the (location, downlink)
 *	pairs from internal pages onto a private stack.  If we split a
 *	leaf, we use this stack to walk back up the tree and insert data
 *	into parent pages (and possibly to split them, too).  Lehman and
 *	Yao's update algorithm guarantees that under no circumstances can
 *	our private stack give us an irredeemably bad picture up the tree.
 *	Again, see the paper for details.
 */

typedef struct BTStackData
{
	BlockNumber bts_blkno;
	OffsetNumber bts_offset;
	IndexTupleData bts_btentry;
	struct BTStackData *bts_parent;
} BTStackData;

typedef BTStackData *BTStack;
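
/*
 * Illustrative sketch: the stack is singly linked through bts_parent, so
 * climbing back up toward the root after a leaf split is just:
 *
 *		BTStack s;
 *
 *		for (s = stack; s != NULL; s = s->bts_parent)
 *			... insert into (and possibly split) the page at s->bts_blkno ...
 *
 * _bt_freestack() (declared below) releases the whole chain.
 */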
/*
 * BTScanOpaqueData is the btree-private state needed for an indexscan.
 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 * details of the preprocessing), information about the current location
 * of the scan, and information about the marked location, if any.  (We use
 * BTScanPosData to represent the data needed for each of current and marked
 * locations.)  In addition we can remember some known-killed index entries
 * that must be marked before we can move off the current page.
 *
 * Index scans work a page at a time: we pin and read-lock the page, identify
 * all the matching items on the page and save them in BTScanPosData, then
 * release the read-lock while returning the items to the caller for
 * processing.  This approach minimizes lock/unlock traffic.  Note that we
 * keep the pin on the index page until the caller is done with all the items
 * (this is needed for VACUUM synchronization, see nbtree/README).  When we
 * are ready to step to the next page, if the caller has told us any of the
 * items were killed, we re-lock the page to mark them killed, then unlock.
 * Finally we drop the pin and step to the next page in the appropriate
 * direction.
 *
 * If we are doing an index-only scan, we save the entire IndexTuple for each
 * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 * into a separate workspace array; each BTScanPosItem stores its tuple's
 * offset within that array.
 */

typedef struct BTScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
} BTScanPosItem;

typedef struct BTScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */

	XLogRecPtr	lsn;			/* pos in the WAL stream when page was read */
	BlockNumber currPage;		/* page referenced by items array */
	BlockNumber nextPage;		/* page's right link when we scanned it */

	/*
	 * moreLeft and moreRight track whether we think there may be matching
	 * index entries to the left and right of the current page, respectively.
	 * We can clear the appropriate one of these flags when _bt_checkkeys()
	 * returns continuescan = false.
	 */
	bool		moreLeft;
	bool		moreRight;

	/*
	 * If we are doing an index-only scan, nextTupleOffset is the first free
	 * location in the associated tuple storage workspace.
	 */
	int			nextTupleOffset;

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;
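
/*
 * Illustrative sketch (modeled on _bt_next's forward case, where "so" is
 * the scan's BTScanOpaque, defined below): stepping through the saved
 * matches is a cursor increment within items[], and the index itself is
 * touched again only when the array is exhausted:
 *
 *		if (++so->currPos.itemIndex > so->currPos.lastItem)
 *			... step right to currPos.nextPage and refill items[] ...
 *		else
 *			currItem = &so->currPos.items[so->currPos.itemIndex];
 */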
#define BTScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)
#define BTScanPosUnpin(scanpos) \
	do { \
		ReleaseBuffer((scanpos).buf); \
		(scanpos).buf = InvalidBuffer; \
	} while (0)
#define BTScanPosUnpinIfPinned(scanpos) \
	do { \
		if (BTScanPosIsPinned(scanpos)) \
			BTScanPosUnpin(scanpos); \
	} while (0)

#define BTScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)
#define BTScanPosInvalidate(scanpos) \
	do { \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).lsn = InvalidXLogRecPtr; \
		(scanpos).nextTupleOffset = 0; \
	} while (0)
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
{
	int			scan_key;		/* index of associated key in arrayKeyData */
	int			cur_elem;		/* index of current element in elem_values */
	int			mark_elem;		/* index of marked element in elem_values */
	int			num_elems;		/* number of elems in current array value */
	Datum	   *elem_values;	/* array of num_elems Datums */
} BTArrayKeyInfo;

typedef struct BTScanOpaqueData
{
	/* these fields are set by _bt_preprocess_keys(): */
	bool		qual_ok;		/* false if qual can never be satisfied */
	int			numberOfKeys;	/* number of preprocessed scan keys */
	ScanKey		keyData;		/* array of preprocessed scan keys */

	/* workspace for SK_SEARCHARRAY support */
	ScanKey		arrayKeyData;	/* modified copy of scan->keyData */
	int			numArrayKeys;	/* number of equality-type array keys (-1 if
								 * there are any unsatisfiable array keys) */
	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
	MemoryContext arrayContext; /* scan-lifespan context for array data */

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * If we are doing an index-only scan, these are the tuple storage
	 * workspaces for the currPos and markPos respectively.  Each is of size
	 * BLCKSZ, so it can hold as much as a full page's worth of tuples.
	 */
	char	   *currTuples;		/* tuple storage for currPos */
	char	   *markTuples;		/* tuple storage for markPos */

	/*
	 * If the marked position is on the same page as current position, we
	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
	 * (all the rest of currPos is valid for the mark position).  Hence, to
	 * determine if there is a mark, first look at markItemIndex, then at
	 * markPos.
	 */
	int			markItemIndex;	/* itemIndex, or -1 if not valid */

	/* keep these last in struct for efficiency */
	BTScanPosData currPos;		/* current position data */
	BTScanPosData markPos;		/* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;

/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD	0x00010000		/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000		/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24		/* must clear the above bits */
#define SK_BT_DESC			(INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
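
/*
 * For example (illustrative), scan code can test a preprocessed key's
 * ordering options directly:
 *
 *		if (skey->sk_flags & SK_BT_DESC)
 *			... the index column is in descending order ...
 *		if (skey->sk_flags & SK_BT_NULLS_FIRST)
 *			... NULLs sort before non-NULLs in this column ...
 */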
/*
 * prototypes for functions in nbtree.c (external entry points for btree)
 */
extern Datum bthandler(PG_FUNCTION_ARGS);
extern IndexBuildResult *btbuild(Relation heap, Relation index,
		struct IndexInfo *indexInfo);
extern void btbuildempty(Relation index);
extern bool btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique);
extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys);
extern void btendscan(IndexScanDesc scan);
extern void btmarkpos(IndexScanDesc scan);
extern void btrestrpos(IndexScanDesc scan);
extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
			 IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback,
			 void *callback_state);
extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
				IndexBulkDeleteResult *stats);
extern bool btcanreturn(Relation index, int attno);

/*
 * prototypes for functions in nbtinsert.c
 */
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);

/*
 * prototypes for functions in nbtpage.c
 */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int	_bt_getrootheight(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
				 BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems,
					BlockNumber lastBlockVacuumed);
extern int	_bt_pagedel(Relation rel, Buffer buf);

/*
 * prototypes for functions in nbtsearch.c
 */
extern BTStack _bt_search(Relation rel,
		   int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access, Snapshot snapshot);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
			  ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
			  int access, Snapshot snapshot);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
			ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
			Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
				 Snapshot snapshot);

/*
 * prototypes for functions in nbtutils.c
 */
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
			  Page page, OffsetNumber offnum,
			  ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);
extern bytea *btoptions(Datum reloptions, bool validate);
extern bool btproperty(Oid index_oid, int attno,
		   IndexAMProperty prop, const char *propname,
		   bool *res, bool *isnull);

/*
 * prototypes for functions in nbtvalidate.c
 */
extern bool btvalidate(Oid opclassoid);

/*
 * prototypes for functions in nbtsort.c
 */
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */

extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
			  bool isunique, bool isdead);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(BTSpool *btspool, ItemPointer self,
		  Datum *values, bool *isnull);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);

/*
 * prototypes for functions in nbtxlog.c
 */
extern void btree_redo(XLogReaderState *record);
extern void btree_desc(StringInfo buf, XLogReaderState *record);
extern const char *btree_identify(uint8 info);

#endif   /* NBTREE_H */