Copy the text presentation of the US Constitution from the sample directory of the Apache Lucy distribution to the base level of your web server’s htdocs directory.
$ cp -R sample/us_constitution /usr/local/apache2/htdocs/
Our first task will be to create an application called indexer.c which builds a searchable “inverted index” from a collection of documents.
After we pull in the necessary headers and define a few configuration constants…
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CFISH_USE_SHORT_NAMES
#define LUCY_USE_SHORT_NAMES
#include "Clownfish/String.h"
#include "Lucy/Simple.h"
#include "Lucy/Document/Doc.h"
const char path_to_index[] = "lucy_index";
const char uscon_source[] = "../../common/sample/us_constitution";
… we’ll start by creating a Lucy::Simple object, telling it where we’d like the index to be located and the language of the source material.
int
main() {
    // Initialize the library.
    lucy_bootstrap_parcel();

    String *folder   = Str_newf("%s", path_to_index);
    String *language = Str_newf("en");
    Simple *lucy     = Simple_new((Obj*)folder, language);
Next, we’ll add a function which parses our sample documents.
Doc*
S_parse_file(const char *filename) {
    size_t bytes = strlen(uscon_source) + 1 + strlen(filename) + 1;
    char *path = (char*)malloc(bytes);
    path[0] = '\0';
    strcat(path, uscon_source);
    strcat(path, "/");
    strcat(path, filename);

    FILE *stream = fopen(path, "r");
    if (stream == NULL) {
        perror(path);
        exit(1);
    }

    char *title    = NULL;
    char *bodytext = NULL;
    if (fscanf(stream, "%m[^\r\n] %m[\x01-\x7F]", &title, &bodytext) != 2) {
        fprintf(stderr, "Can't extract title/bodytext from '%s'", path);
        exit(1);
    }

    Doc *doc = Doc_new(NULL, 0);

    {
        // Store 'title' field.
        String *field = Str_newf("title");
        String *value = Str_new_from_utf8(title, strlen(title));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    {
        // Store 'content' field.
        String *field = Str_newf("content");
        String *value = Str_new_from_utf8(bodytext, strlen(bodytext));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    {
        // Store 'url' field.
        String *field = Str_newf("url");
        String *value = Str_new_from_utf8(filename, strlen(filename));
        Doc_Store(doc, field, (Obj*)value);
        DECREF(field);
        DECREF(value);
    }

    fclose(stream);
    free(bodytext);
    free(title);
    free(path);

    return doc;
}
Add some elementary directory reading code…
    DIR *dir = opendir(uscon_source);
    if (dir == NULL) {
        perror(uscon_source);
        return 1;
    }
… and now we’re ready for the meat of indexer.c, which occupies exactly one line of code.
    for (struct dirent *entry = readdir(dir);
         entry;
         entry = readdir(dir)) {
        if (S_ends_with(entry->d_name, ".txt")) {
            Doc *doc = S_parse_file(entry->d_name);
            Simple_Add_Doc(lucy, doc);  // ta-da!
            DECREF(doc);
        }
    }

    closedir(dir);

    DECREF(lucy);
    DECREF(language);
    DECREF(folder);
    return 0;
}
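One detail the snippets above leave out is the S_ends_with() helper used to filter on the “.txt” extension; it isn’t shown in this chapter. A minimal sketch of such a helper, defined above main(), might look like this:
// Sketch of the helper assumed by the loop above: returns non-zero if
// 'string' ends with 'postfix'.
static int
S_ends_with(const char *string, const char *postfix) {
    size_t len         = strlen(string);
    size_t postfix_len = strlen(postfix);
    return len >= postfix_len
           && memcmp(string + len - postfix_len, postfix, postfix_len) == 0;
}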
As with our indexing app, the bulk of the code in our search program won’t be Lucy-specific. The beginning is dedicated to command-line argument handling and configuration.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CFISH_USE_SHORT_NAMES
#define LUCY_USE_SHORT_NAMES
#include "Clownfish/String.h"
#include "Lucy/Document/HitDoc.h"
#include "Lucy/Simple.h"
const char path_to_index[] = "lucy_index";
static void
S_usage_and_exit(const char *arg0) {
    printf("Usage: %s <querystring>\n", arg0);
    exit(1);
}
int
main(int argc, char *argv[]) {
    // Initialize the library.
    lucy_bootstrap_parcel();

    if (argc != 2) {
        S_usage_and_exit(argv[0]);
    }
    const char *query_c = argv[1];
    printf("Searching for: %s\n\n", query_c);
Once that’s out of the way, we create our Lucy::Simple object and feed it a query string.
    String *folder   = Str_newf("%s", path_to_index);
    String *language = Str_newf("en");
    Simple *lucy     = Simple_new((Obj*)folder, language);

    String *query_str = Str_newf("%s", query_c);
    Simple_Search(lucy, query_str, 0, 10);
The value returned by Search() is the total number of documents in the collection which matched the query. This hit count can be shown to the user and used in conjunction with the offset and num_wanted parameters to break up results into “pages” of manageable size.
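The code above discards that return value. As a hedged sketch, not part of the sample as shown, capturing the count and requesting a second page of ten hits might look like the following, assuming Search() reports the hit count as an unsigned 32-bit integer (add <stdint.h> if the Clownfish headers don’t already provide uint32_t):
    // Sketch only: capture the total hit count reported by Search().
    uint32_t total_hits = Simple_Search(lucy, query_str, 0, 10);
    printf("Total hits: %u\n", (unsigned)total_hits);

    // To display hits 11-20, search again with an offset of 10 before
    // iterating over the results:
    // Simple_Search(lucy, query_str, 10, 10);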
Calling Search() on our Simple object turns it into an iterator. Invoking Next() now returns hits one at a time as HitDoc objects, starting with the most relevant.
    String *title_str = Str_newf("title");
    String *url_str   = Str_newf("url");
    HitDoc *hit;
    int i = 1;

    // Loop over search results.
    while (NULL != (hit = Simple_Next(lucy))) {
        String *title = (String*)HitDoc_Extract(hit, title_str);
        char *title_c = Str_To_Utf8(title);

        String *url = (String*)HitDoc_Extract(hit, url_str);
        char *url_c = Str_To_Utf8(url);

        printf("Result %d: %s (%s)\n", i, title_c, url_c);

        free(url_c);
        free(title_c);
        DECREF(url);
        DECREF(title);
        DECREF(hit);
        i++;
    }

    DECREF(url_str);
    DECREF(title_str);
    DECREF(query_str);
    DECREF(lucy);
    DECREF(language);
    DECREF(folder);
    return 0;
}
And that’s the whole program: once the loop finishes, we simply release our objects and return.
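To try things out, compile the two programs against the Lucy and Clownfish libraries (build steps aren’t covered in this chapter) and run them from the command line. The binary names below are just placeholders:
$ ./indexer
$ ./search congress
The search program echoes the query it was given, then prints one “Result N: title (url)” line per hit, most relevant first.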
Lucy::Simple is perfectly adequate for some tasks, but it’s not very flexible. Many people find that it doesn’t do at least one or two things they can’t live without.
In our next tutorial chapter, BeyondSimpleTutorial, we’ll rewrite our indexing and search programs using the classes that Lucy::Simple hides from view, opening up the possibilities for expansion; then, we’ll spend the rest of the tutorial chapters exploring these possibilities.
Copyright © 2010-2015 The Apache Software Foundation, Licensed under the
Apache License, Version 2.0.
Apache Lucy, Lucy, Apache, the Apache feather logo, and the Apache Lucy project logo are trademarks of The
Apache Software Foundation. All other marks mentioned may be trademarks or registered trademarks of their
respective owners.