plugin_ftparser.h 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
  2. This program is free software; you can redistribute it and/or modify
  3. it under the terms of the GNU General Public License, version 2.0,
  4. as published by the Free Software Foundation.
  5. This program is also distributed with certain software (including
  6. but not limited to OpenSSL) that is licensed under separate terms,
  7. as designated in a particular file or component or in included license
  8. documentation. The authors of MySQL hereby grant you an additional
  9. permission to link the program and your derivative works with the
  10. separately licensed software that they have included with MySQL.
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License, version 2.0, for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with this program; if not, write to the Free Software
  17. Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
  18. #ifndef _my_plugin_ftparser_h
  19. #define _my_plugin_ftparser_h
  20. #include "plugin.h"
  /*************************************************************************
    API for the full-text parser plugin (MYSQL_FTPARSER_PLUGIN).
  */
  /* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode. */
  enum enum_ftparser_mode
  {
    /*
      Fast and simple mode. This mode is used for indexing and for natural
      language queries.

      The parser is expected to return only those words that go into the
      index. Stopwords and words that are too short or too long should not
      be returned. The 'boolean_info' argument of mysql_add_word() does not
      have to be set.
    */
    MYSQL_FTPARSER_SIMPLE_MODE = 0,

    /*
      Parse with stopwords mode. This mode is used in boolean searches for
      "phrase matching".

      The parser is not allowed to ignore words in this mode. Every word
      should be returned, including stopwords and words that are too short
      or too long. The 'boolean_info' argument of mysql_add_word() does not
      have to be set.
    */
    MYSQL_FTPARSER_WITH_STOPWORDS = 1,

    /*
      Parse in boolean mode. This mode is used to parse a boolean query
      string.

      The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
      structure in the 'boolean_info' argument to mysql_add_word(). Usually
      that means the parser should recognize boolean operators in the
      parsing stream and set the appropriate fields of the
      MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As in
      MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored;
      instead, use FT_TOKEN_STOPWORD as the token type of such a word.
    */
    MYSQL_FTPARSER_FULL_BOOLEAN_INFO = 2
  };
  /*
    Token types for boolean mode searching (used for the 'type' member of
    the MYSQL_FTPARSER_BOOLEAN_INFO struct).
  */
  enum enum_ft_token_type
  {
    FT_TOKEN_EOF = 0,          /* End of data. */
    FT_TOKEN_WORD = 1,         /* Regular word. */
    FT_TOKEN_LEFT_PAREN = 2,   /* Left parenthesis (start of group/sub-expression). */
    FT_TOKEN_RIGHT_PAREN = 3,  /* Right parenthesis (end of group/sub-expression). */
    FT_TOKEN_STOPWORD = 4      /* Stopword. */
  };
  73. /*
  74. This structure is used in boolean search mode only. It conveys
  75. boolean-mode metadata to the MySQL search engine for every word in
  76. the search query. A valid instance of this structure must be filled
  77. in by the plugin parser and passed as an argument in the call to
  78. mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
  79. structure) when a query is parsed in boolean mode.
  80. type: The token type. Should be one of the enum_ft_token_type values.
  81. yesno: Whether the word must be present for a match to occur:
  82. >0 Must be present
  83. <0 Must not be present
  84. 0 Neither; the word is optional but its presence increases the relevance
  85. With the default settings of the ft_boolean_syntax system variable,
  86. >0 corresponds to the '+' operator, <0 corrresponds to the '-' operator,
  87. and 0 means neither operator was used.
  88. weight_adjust: A weighting factor that determines how much a match
  89. for the word counts. Positive values increase, negative - decrease the
  90. relative word's importance in the query.
  91. wasign: The sign of the word's weight in the query. If it's non-negative
  92. the match for the word will increase document relevance, if it's
  93. negative - decrease (the word becomes a "noise word", the less of it the
  94. better).
  95. trunc: Corresponds to the '*' operator in the default setting of the
  96. ft_boolean_syntax system variable.
  97. position: Start position in bytes of the word in the document, used by InnoDB FTS.
  98. */
  99. typedef struct st_mysql_ftparser_boolean_info
  100. {
  101. enum enum_ft_token_type type;
  102. int yesno;
  103. int weight_adjust;
  104. char wasign;
  105. char trunc;
  106. int position;
  107. /* These are parser state and must be removed. */
  108. char prev;
  109. char *quot;
  110. } MYSQL_FTPARSER_BOOLEAN_INFO;
  /*
    This flag means that the buffer holding a string (document, word) may
    be overwritten by the caller before the end of the parsing (that is,
    before the st_mysql_ftparser::deinit() call). Code that needs the string
    to survive between two successive calls of the parsing function must
    save a copy of it. The flag may be set by MySQL before calling
    st_mysql_ftparser::parse(), or it may be set by a plugin before calling
    st_mysql_ftparser_param::mysql_parse() or
    st_mysql_ftparser_param::mysql_add_word().
  */
  #define MYSQL_FTFLAGS_NEED_COPY 1
  122. /*
  123. An argument of the full-text parser plugin. This structure is
  124. filled in by MySQL server and passed to the parsing function of the
  125. plugin as an in/out parameter.
  126. mysql_parse: A pointer to the built-in parser implementation of the
  127. server. It's set by the server and can be used by the parser plugin
  128. to invoke the MySQL default parser. If plugin's role is to extract
  129. textual data from .doc, .pdf or .xml content, it might extract
  130. plaintext from the content, and then pass the text to the default
  131. MySQL parser to be parsed.
  132. mysql_add_word: A server callback to add a new word. When parsing
  133. a document, the server sets this to point at a function that adds
  134. the word to MySQL full-text index. When parsing a search query,
  135. this function will add the new word to the list of words to search
  136. for. The boolean_info argument can be NULL for all cases except
  137. when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
  138. ftparser_state: A generic pointer. The plugin can set it to point
  139. to information to be used internally for its own purposes.
  140. mysql_ftparam: This is set by the server. It is used by MySQL functions
  141. called via mysql_parse() and mysql_add_word() callback. The plugin
  142. should not modify it.
  143. cs: Information about the character set of the document or query string.
  144. doc: A pointer to the document or query string to be parsed.
  145. length: Length of the document or query string, in bytes.
  146. flags: See MYSQL_FTFLAGS_* constants above.
  147. mode: The parsing mode. With boolean operators, with stopwords, or
  148. nothing. See enum_ftparser_mode above.
  149. */
  150. typedef struct st_mysql_ftparser_param
  151. {
  152. int (*mysql_parse)(struct st_mysql_ftparser_param *,
  153. char *doc, int doc_len);
  154. int (*mysql_add_word)(struct st_mysql_ftparser_param *,
  155. char *word, int word_len,
  156. MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
  157. void *ftparser_state;
  158. void *mysql_ftparam;
  159. const struct charset_info_st *cs;
  160. char *doc;
  161. int length;
  162. int flags;
  163. enum enum_ftparser_mode mode;
  164. } MYSQL_FTPARSER_PARAM;
  165. /*
  166. Full-text parser descriptor.
  167. interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
  168. The parsing, initialization, and deinitialization functions are
  169. invoked per SQL statement for which the parser is used.
  170. */
  171. struct st_mysql_ftparser
  172. {
  173. int interface_version;
  174. int (*parse)(MYSQL_FTPARSER_PARAM *param);
  175. int (*init)(MYSQL_FTPARSER_PARAM *param);
  176. int (*deinit)(MYSQL_FTPARSER_PARAM *param);
  177. };
  178. #endif