English | Japanese
Last Modified: 2002-10-22 (Since: 2002-10-22)
Construct the suffix array for file_name, assigning index points character by character in UTF-8 encoding, and store the resulting suffix array in the file named file_name.ary. Consult mksary.c for a detailed example.
#include <stdlib.h> #include <errno.h> #include <sary.h> int main (int argc, char **argv) { char *file_name; SaryInt ipoints; gboolean status; SaryBuilder *builder; if (argc != 2) exit(2); file_name = argv[1]; builder = sary_builder_new(file_name); sary_builder_set_ipoint_func(builder, sary_ipoint_char_utf8); ipoints = sary_builder_index(builder); if (ipoints == -1) { g_print("error: %s(.ary): %s\n", file_name, g_strerror(errno)); exit(2); } status = sary_builder_sort(builder); if (status == FALSE) { g_print("error: %s(.ary): %s\n", file_name, g_strerror(errno)); exit(2); } sary_builder_destroy(builder); return 0; }
Search file_name for the pattern, sort the results in occurrence order, and print them line by line. The suffix array for file_name MUST already exist. Consult sary.c for a detailed example.
#include <stdlib.h> #include <errno.h> #include <sary.h> int main (int argc, char **argv) { SarySearcher *searcher; char *pattern; char *file_name; if (argc != 3) exit(2); pattern = argv[1]; file_name = argv[2]; searcher = sary_searcher_new(file_name); if (searcher == NULL) { g_print("error: %s(.ary): %s\n", file_name, g_strerror(errno)); exit(2); } if (sary_searcher_search(searcher, pattern)) { gchar *line; sary_searcher_sort_occurrences(searcher); while ((line = sary_searcher_get_next_line(searcher))) { g_print("%s", line); g_free(line); } } sary_searcher_destroy(searcher); return 0; }
A program.c that uses libsary can be compiled as follows. Using autoconf, automake, and libtool is recommended for real development.
% gcc program.c -o program `pkg-config sary --libs` `pkg-config sary --cflags`
SaryBuilder* sary_builder_new (const gchar *file_name);
SaryBuilder* sary_builder_new2 (const gchar *file_name, const gchar *array_name);
void sary_builder_destroy (SaryBuilder *builder);
void sary_builder_set_ipoint_func (SaryBuilder *builder, SaryIpointFunc ipoint_func);
SaryInt sary_builder_index (SaryBuilder *builder);
gboolean sary_builder_sort (SaryBuilder *builder);
gboolean sary_builder_block_sort (SaryBuilder *builder);
void sary_builder_set_block_size (SaryBuilder *builder, SaryInt block_size);
void sary_builder_set_nthreads (SaryBuilder *builder, SaryInt nthreads);
void sary_builder_connect_progress (SaryBuilder *builder, SaryProgressFunc progress_func, gpointer progress_func_data);
gchar* sary_ipoint_char_ascii (SaryText *text);
gchar* sary_ipoint_char_eucjp (SaryText *text);
gchar* sary_ipoint_char_sjis (SaryText *text);
gchar* sary_ipoint_char_utf8 (SaryText *text);
gchar* sary_ipoint_line (SaryText *text);
gchar* sary_ipoint_word (SaryText *text);
NOTE: sary_searcher_search2, sary_searcher_get_next_line2, sary_searcher_get_next_context_lines2, and sary_searcher_get_next_tagged_region2 are provided especially for those who want to write bindings for scripting languages and for performance enthusiasts. Since each string in these functions is handled together with its length, the string can contain '\0'. No strings are newly created.
SarySearcher* sary_searcher_new (const gchar *file_name);
SarySearcher* sary_searcher_new2 (const gchar *file_name, const gchar *array_name);
void sary_searcher_destroy (SarySearcher *searcher);
void sary_searcher_enable_cache (SarySearcher *searcher);
gboolean sary_searcher_search (SarySearcher *searcher, const gchar *pattern);
gboolean sary_searcher_search2 (SarySearcher *searcher, const gchar *pattern, SaryInt len);
gboolean sary_searcher_isearch (SarySearcher *searcher, const gchar *pattern, SaryInt len);
gboolean sary_searcher_isearch_reset (SarySearcher *searcher);
gboolean sary_searcher_icase_search (SarySearcher *searcher, const gchar *pattern);
gboolean sary_searcher_icase_search2 (SarySearcher *searcher, const gchar *pattern, SaryInt len);
SaryText* sary_searcher_get_text (SarySearcher *searcher);
SaryMmap* sary_searcher_get_array (SarySearcher *searcher);
gchar* sary_searcher_get_next_line (SarySearcher *searcher);
gchar* sary_searcher_get_next_line2 (SarySearcher *searcher, SaryInt *len);
gchar* sary_searcher_get_next_context_lines (SarySearcher *searcher, gint backward, gint forward);
gchar* sary_searcher_get_next_context_lines2 (SarySearcher *searcher, SaryInt backward, SaryInt forward, SaryInt *len);
gchar* sary_searcher_get_next_tagged_region (SarySearcher *searcher, const gchar *start_tag, const gchar *end_tag);
gchar* sary_searcher_get_next_tagged_region2 (SarySearcher *searcher, const gchar *start_tag, SaryInt start_tag_len, const gchar *end_tag, SaryInt end_tag_len, SaryInt *len);
Get the next search result as a tagged region between start_tag and end_tag (including start_tag and end_tag) as a pointer. Store the length of the string into len. All results can be retrieved by calling the function repeatedly. Return NULL if there are no more results.
SaryText* sary_searcher_get_next_occurrence (SarySearcher *searcher);
SaryInt sary_searcher_get_next_position (SarySearcher *searcher);
SaryInt sary_searcher_count_occurrences (SarySearcher *searcher);
void sary_searcher_sort_occurrences (SarySearcher *searcher);
A SaryText object is used for text processing. The object has a state called the cursor. Operations on the object are performed using the cursor state.
SaryText* sary_text_new (const gchar *file_name);
void sary_text_destroy (SaryText *text);
SaryInt sary_text_get_lineno (SaryText *text);
void sary_text_set_lineno (SaryText *text, SaryInt lineno);
SaryInt sary_text_get_linelen (SaryText *text);
gchar* sary_text_get_line (SaryText *text);
gchar* sary_text_get_region (SaryText *cursor, SaryInt len);
gboolean sary_text_is_eof (SaryText *text);
gchar* sary_text_get_cursor (SaryText *text);
void sary_text_set_cursor (SaryText *text, gchar *cursor);
gchar* sary_text_get_bof (SaryText *text);
gchar* sary_text_get_eof (SaryText *text);
gchar* sary_text_goto_next_line (SaryText *text);
gchar* sary_text_goto_next_word (SaryText *text);
gchar* sary_text_goto_bol (SaryText *text);
gchar* sary_text_goto_eol (SaryText *text);
gchar* sary_text_forward_cursor (SaryText *text, SaryInt step);
gchar* sary_text_backward_cursor (SaryText *text, SaryInt step);
Please consult the progress_bar function in mksary.c.
Index points can also be assigned easily with scripting languages such as Perl. Using such a language helps when advanced text processing is needed. The following example shows how to assign index points line by line.
% cat line-indexer.pl
# Emit one index point per input line: the offset at which each line
# starts, packed with the 'N' template.
my $offset = 0;
while (my $line = <>) {
    print pack('N', $offset);
    $offset += length $line;
}
% perl line-indexer.pl foobar.txt > foobar.txt.ary
Then, sort the resulting ary file to construct the suffix array. This can be done with the mksary -s command.
% mksary -s foobar.txt