From ff159bf30ac1a213c3dd0b0714c7cd07db616ad2 Mon Sep 17 00:00:00 2001
From: Artem Bulavin <hdr.dmc@gmail.com>
Date: Tue, 16 Aug 2016 16:23:55 +0200
Subject: [PATCH] Simple Mongoose HTTP URL crawler

PUBLISHED_FROM=4eead54610606827963e7c244fcd8ab9a13d4c07
---
 examples/simple_crawler/Makefile         |  3 +
 examples/simple_crawler/simple_crawler.c | 91 ++++++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 examples/simple_crawler/Makefile
 create mode 100644 examples/simple_crawler/simple_crawler.c

diff --git a/examples/simple_crawler/Makefile b/examples/simple_crawler/Makefile
new file mode 100644
index 000000000..fead37705
--- /dev/null
+++ b/examples/simple_crawler/Makefile
@@ -0,0 +1,3 @@
+PROG = simple_crawler
+MODULE_CFLAGS = ../../../slre/slre.c
+include ../examples.mk
diff --git a/examples/simple_crawler/simple_crawler.c b/examples/simple_crawler/simple_crawler.c
new file mode 100644
index 000000000..55ffabd2e
--- /dev/null
+++ b/examples/simple_crawler/simple_crawler.c
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mongoose.h"
+#include "../../../slre/slre.h"
+
+/* Matches href="http://..." / href="https://..." attributes; capture group 1
+   (caps[0] in slre) is the full absolute URL. */
+static const char *regex = "href=\"((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";
+/* Links found on a page at this depth are not followed any further. */
+const int max_depth = 2;
+
+/* Per-connection crawl state, attached to nc->user_data by crawl_page() and
+   released in the MG_EV_CLOSE branch of event_handler(). */
+struct userdata {
+  char *url;   /* heap-allocated, NUL-terminated copy of the URL being fetched */
+  int depth;   /* crawl depth of this page; 0 is the seed URL */
+};
+
+void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len, int depth);
+void handle_reply(struct mg_connection *nc, struct http_message *hm);
+
+/*
+ * Mongoose event callback for every crawler connection.
+ *
+ * MG_EV_CONNECT:    `data` points to the connect status; report failures.
+ * MG_EV_CLOSE:      free the struct userdata allocated by crawl_page().
+ * MG_EV_HTTP_REPLY: extract further links from the page, then close.
+ *
+ * nc->user_data may be NULL if crawl_page() failed before attaching it,
+ * so every access is guarded.
+ */
+static void event_handler(struct mg_connection *nc, int event, void *data) {
+  struct http_message *hm = (struct http_message *) data;
+  struct userdata *ud = (struct userdata *) nc->user_data;
+  int connect_status;
+
+  switch (event) {
+    case MG_EV_CONNECT:
+      connect_status = *(int *) data;
+      if (connect_status != 0) {
+        printf("Error while loading page: %s, error: %s\n",
+               ud != NULL ? ud->url : "(unknown)", strerror(connect_status));
+      }
+      break;
+    case MG_EV_CLOSE:
+      if (ud != NULL) {
+        free(ud->url);
+        free(ud);
+        nc->user_data = NULL; /* defend against double-free on re-entry */
+      }
+      break;
+    case MG_EV_HTTP_REPLY:
+      handle_reply(nc, hm);
+      nc->flags |= MG_F_SEND_AND_CLOSE;
+      break;
+    default:
+      break;
+  }
+}
+
+/*
+ * Entry point: seed the crawler with one URL and poll the event manager
+ * forever.  The loop never exits; terminate the program with Ctrl-C.
+ */
+int main(void) {
+  struct mg_mgr mgr;
+
+  mg_mgr_init(&mgr, NULL);
+  /* (size_t) ~0 tells crawl_page() to compute the length via strlen(). */
+  crawl_page(&mgr, "http://www.simpleweb.org/", (size_t) ~0, 0);
+
+  for (;;) {
+    mg_mgr_poll(&mgr, 1000);
+  }
+
+  /* Not reached; kept for symmetry with mg_mgr_init(). */
+  mg_mgr_free(&mgr);
+
+  return 0;
+}
+
+/*
+ * Start an asynchronous HTTP fetch of `url` at the given crawl depth.
+ *
+ * url_len may be (size_t) ~0, meaning "use strlen(url)".  The URL is copied
+ * into a heap-allocated struct userdata owned by the connection and freed by
+ * event_handler() on MG_EV_CLOSE.  On allocation or connect failure the
+ * page is silently skipped and all partial allocations are released.
+ */
+void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len, int depth) {
+  struct mg_connection *nc;
+  struct userdata *data;
+
+  if (url_len == (size_t) ~0) {
+    url_len = strlen(url);
+  }
+
+  data = malloc(sizeof *data);
+  if (data == NULL) {
+    return;
+  }
+  data->url = malloc(url_len + 1);
+  if (data->url == NULL) {
+    free(data);
+    return;
+  }
+  memcpy(data->url, url, url_len);
+  data->url[url_len] = '\0';
+  data->depth = depth;
+
+  /* `url` may be a non-NUL-terminated slice into a page body (see
+     handle_reply()), so pass our terminated copy to mg_connect_http(). */
+  nc = mg_connect_http(mgr, event_handler, data->url, NULL, NULL);
+  if (nc == NULL) {
+    /* Connection was never created, so MG_EV_CLOSE will not fire: free here. */
+    free(data->url);
+    free(data);
+    return;
+  }
+  nc->user_data = data;
+}
+
+/*
+ * Scan a fetched page body for href="http(s)://..." links and schedule each
+ * one for crawling at depth + 1, stopping once max_depth is reached.
+ */
+void handle_reply(struct mg_connection *nc, struct http_message *hm) {
+  struct userdata *ud = (struct userdata *) nc->user_data;
+  const char *body = hm->body.p;
+  /* The body buffer is NOT NUL-terminated: use the length mongoose reports
+     rather than strlen(), which would read past the end of the buffer. */
+  int body_len = (int) hm->body.len;
+  enum { MAX_CAPS = 2 };
+  struct slre_cap caps[MAX_CAPS];
+  int offset, cursor = 0;
+
+  printf("Loaded url: %s at depth %d\n", ud->url, ud->depth);
+  if (ud->depth == max_depth) {
+    return;
+  }
+
+  while (cursor < body_len &&
+         (offset = slre_match(regex, body + cursor, body_len - cursor, caps,
+                              MAX_CAPS, SLRE_IGNORE_CASE)) > 0) {
+    /* caps[0] holds capture group 1: the full URL (pointer + length,
+       not NUL-terminated). */
+    crawl_page(nc->mgr, caps[0].ptr, caps[0].len, ud->depth + 1);
+    cursor += offset;
+  }
+}
-- 
GitLab