From ff159bf30ac1a213c3dd0b0714c7cd07db616ad2 Mon Sep 17 00:00:00 2001
From: Artem Bulavin <hdr.dmc@gmail.com>
Date: Tue, 16 Aug 2016 16:23:55 +0200
Subject: [PATCH] Simple mongoose http urls crawler

PUBLISHED_FROM=4eead54610606827963e7c244fcd8ab9a13d4c07
---
 examples/simple_crawler/Makefile         |   3 +
 examples/simple_crawler/simple_crawler.c | 125 +++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 examples/simple_crawler/Makefile
 create mode 100644 examples/simple_crawler/simple_crawler.c

diff --git a/examples/simple_crawler/Makefile b/examples/simple_crawler/Makefile
new file mode 100644
--- /dev/null
+++ b/examples/simple_crawler/Makefile
@@ -0,0 +1,3 @@
+PROG = simple_crawler
+MODULE_CFLAGS = ../../../slre/slre.c
+include ../examples.mk
diff --git a/examples/simple_crawler/simple_crawler.c b/examples/simple_crawler/simple_crawler.c
new file mode 100644
--- /dev/null
+++ b/examples/simple_crawler/simple_crawler.c
@@ -0,0 +1,125 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mongoose.h"
+#include "../../../slre/slre.h"
+
+/* Matches absolute http(s) URLs inside href="..." attributes. */
+static const char *regex =
+    "href=\"((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
+
+const int max_depth = 2; /* how many link levels deep to follow */
+
+enum { MAX_MATCHES = 2 }; /* capture slots used with the regex */
+
+/* Per-connection state: the URL being fetched and its crawl depth.
+ * Owned by the connection; freed exactly once on MG_EV_CLOSE. */
+struct userdata {
+  char *url;
+  int depth;
+};
+
+void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len,
+                int depth);
+void handle_reply(struct mg_connection *nc, struct http_message *hm);
+
+/* Mongoose event callback: reports connect errors, scans replies for
+ * links, and releases the per-connection userdata on close. */
+static void event_handler(struct mg_connection *nc, int event, void *data) {
+  struct http_message *hm = (struct http_message *) data;
+  struct userdata *ud = (struct userdata *) nc->user_data;
+  int connect_status;
+
+  switch (event) {
+    case MG_EV_CONNECT:
+      /* For MG_EV_CONNECT, `data` points to an int connect result. */
+      connect_status = *(int *) data;
+      if (connect_status != 0) {
+        printf("Error while loading page: %s, error: %s\n", ud->url,
+               strerror(connect_status));
+      }
+      break;
+    case MG_EV_CLOSE:
+      free(ud->url);
+      free(ud);
+      break;
+    case MG_EV_HTTP_REPLY:
+      handle_reply(nc, hm);
+      nc->flags |= MG_F_SEND_AND_CLOSE;
+      break;
+    default:
+      break;
+  }
+}
+
+int main(void) {
+  struct mg_mgr mgr;
+
+  mg_mgr_init(&mgr, NULL);
+  crawl_page(&mgr, "http://www.simpleweb.org/", (size_t) ~0, 0);
+
+  for (;;) {
+    mg_mgr_poll(&mgr, 1000);
+  }
+
+  mg_mgr_free(&mgr); /* unreachable; kept for symmetry with mg_mgr_init */
+
+  return 0;
+}
+
+/* Start fetching the first `url_len` bytes of `url` (pass (size_t) ~0
+ * for a NUL-terminated string) at crawl depth `depth`.  `url` may point
+ * into a larger, non-terminated buffer (an SLRE capture), so a private
+ * NUL-terminated copy is made before handing it to Mongoose. */
+void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len,
+                int depth) {
+  struct mg_connection *nc;
+  struct userdata *data;
+
+  if (url_len == (size_t) ~0) {
+    url_len = strlen(url);
+  }
+
+  data = malloc(sizeof *data);
+  if (data == NULL) return; /* best-effort crawler: skip on OOM */
+
+  data->url = malloc(url_len + 1);
+  if (data->url == NULL) {
+    free(data);
+    return;
+  }
+  memcpy(data->url, url, url_len);
+  data->url[url_len] = '\0';
+  data->depth = depth;
+
+  /* Pass the terminated copy, not the raw (possibly unterminated) url. */
+  nc = mg_connect_http(mgr, event_handler, data->url, NULL, NULL);
+  if (nc == NULL) {
+    free(data->url);
+    free(data);
+    return;
+  }
+  nc->user_data = data;
+}
+
+/* Scan an HTTP reply body for href links and crawl each one level
+ * deeper, until max_depth is reached. */
+void handle_reply(struct mg_connection *nc, struct http_message *hm) {
+  struct userdata *ud = (struct userdata *) nc->user_data;
+  const char *body = hm->body.p;
+  /* The body is NOT NUL-terminated; use its recorded length, never
+   * strlen(), to bound the scan. */
+  int body_len = (int) hm->body.len;
+  int offset, cursor = 0;
+  struct slre_cap caps[MAX_MATCHES];
+
+  printf("Loaded url: %s at depth %d\n", ud->url, ud->depth);
+  if (ud->depth >= max_depth) {
+    return;
+  }
+
+  while (cursor < body_len &&
+         (offset = slre_match(regex, body + cursor, body_len - cursor, caps,
+                              MAX_MATCHES, SLRE_IGNORE_CASE)) > 0) {
+    /* caps[0] is the full URL (group 1); it is length-delimited only. */
+    crawl_page(nc->mgr, caps[0].ptr, (size_t) caps[0].len, ud->depth + 1);
+    cursor += offset;
+  }
+}
-- 
GitLab