diff --git a/Utils/WebUtils.h b/Utils/WebUtils.h
index 29c9b6e..643c33d 100755
--- a/Utils/WebUtils.h
+++ b/Utils/WebUtils.h
@@ -12,8 +12,14 @@
 #include
 #include
+#include
+#include
+#include
 #include
+#include
+#include
+#include
 
 #include "FileUtils.h"
 
 
 
@@ -74,7 +80,17 @@ namespace Utils
         return out->size;
     }
 
+    typedef struct {
+        size_t size = 0;
+        const char **resources = NULL;
+    } ResourceTypes;
+
+    typedef struct {
+        size_t size = 0;
+        const char **urls = NULL;
+    } Urls;
+
     inline
     Utils::FileUtils::file_body download (char *url)
     {
         Utils::FileUtils::file_body page = {0};
@@ -84,6 +100,7 @@ namespace Utils
         }
 
         CURL *h = curl_easy_init();
 
+        curl_easy_setopt(h, CURLOPT_URL, url);
         curl_easy_setopt(h, CURLOPT_PRIVATE, url);
         curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
@@ -97,37 +114,131 @@ namespace Utils
     }
 
     inline
-    void add_transfer(CURLM *cm, char *url, Utils::FileUtils::file_body *page, int *left)
+    void add_transfer(CURLM *cm, const char *url, int *left)
     {
         CURL *h = curl_easy_init();
+
+        // calloc (not malloc) so page->content starts NULL; the completion
+        // handler unconditionally free()s it, which must be safe for an
+        // empty download that never invoked the write callback.
+        Utils::FileUtils::file_body *page = (Utils::FileUtils::file_body *) calloc(1, sizeof(Utils::FileUtils::file_body));
+        page->size = 0;
+
         curl_easy_setopt(h, CURLOPT_URL, url);
-        curl_easy_setopt(h, CURLOPT_PRIVATE, url);
         curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
         curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
         curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data);
         curl_easy_setopt(h, CURLOPT_WRITEDATA, page);
+        curl_easy_setopt(h, CURLOPT_PRIVATE, page);
+
+        curl_easy_setopt(h, CURLOPT_ACCEPT_ENCODING, "");
+        curl_easy_setopt(h, CURLOPT_TIMEOUT, 5L);
+        curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, 1L);
+        curl_easy_setopt(h, CURLOPT_MAXREDIRS, 10L);
+        curl_easy_setopt(h, CURLOPT_CONNECTTIMEOUT, 2L);
+        curl_easy_setopt(h, CURLOPT_COOKIEFILE, "");
+        curl_easy_setopt(h, CURLOPT_FILETIME, 1L);
+        curl_easy_setopt(h, CURLOPT_USERAGENT, "firefox");
+        curl_easy_setopt(h, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
+        curl_easy_setopt(h, CURLOPT_UNRESTRICTED_AUTH, 1L);
+        curl_easy_setopt(h, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
+        curl_easy_setopt(h, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
 
         curl_multi_add_handle(cm, h);
         ++(*left);
     }
 
-    Utils::FileUtils::file_body *multi_download (char **urls, int count, int max_parrallel)
+    // Parse a fetched HTML page, pick random <img src> links and queue the
+    // absolute ones as new transfers on cm. Returns how many were queued
+    // (add_transfer already bumps *left for each).
+    inline size_t follow_links(CURLM *cm, Utils::FileUtils::file_body *page, char *url, int *left)
     {
-        Utils::FileUtils::file_body *pages = (Utils::FileUtils::file_body *) malloc(count * sizeof(Utils::FileUtils::file_body));
+        int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
+        htmlDocPtr doc = htmlReadMemory(page->content, (int) page->size, url, NULL, opts);
+        if(!doc) {
+            return 0;
+        }
+
+        xmlChar *xpath = (xmlChar*) "//img/@src";
+        xmlXPathContextPtr context = xmlXPathNewContext(doc);
+        xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
+        xmlXPathFreeContext(context);
+
+        if(!result) {
+            xmlFreeDoc(doc);
+            return 0;
+        }
+
+        xmlNodeSetPtr nodeset = result->nodesetval;
+        if(xmlXPathNodeSetIsEmpty(nodeset)) {
+            xmlXPathFreeObject(result);
+            xmlFreeDoc(doc);
+            return 0;
+        }
+
+        size_t count = 0;
+        int i;
+
+        for(i = 0; i < nodeset->nodeNr; i++) {
+            double r = rand();
+            // RAND_MAX + 1.0 keeps x strictly below nodeNr (no OOB pick)
+            int x = (int) (r * nodeset->nodeNr / (RAND_MAX + 1.0));
+            const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
+            xmlChar *href = xmlNodeListGetString(doc, node, 1);
+
+            // follow relative link
+            if(true) {
+                xmlChar *orig = href;
+
+                // @todo: consider base= tag which has an impact on relative links
+                href = xmlBuildURI(href, (xmlChar *) url);
+                xmlFree(orig);
+            }
+
+            char *link = (char *) href;
+            if(!link || strlen(link) < 10) {
+                if(link) xmlFree(link);
+                continue;
+            }
+
+            if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8) || !strncmp(link, "www.", 4)) {
+                Utils::WebUtils::add_transfer(cm, link, left);
+
+                // limit to max 1000 links per page to follow
+                if(count++ == 1000) {
+                    xmlFree(link);
+                    break;
+                }
+            }
+
+            xmlFree(link);
+        }
+
+        xmlXPathFreeObject(result);
+        xmlFreeDoc(doc);
+
+        return count;
+    }
+
+    inline void *multi_download(
+        Urls urls,
+        const char *baseDir,
+        int max_parrallel = 1,
+        ResourceTypes *resources = NULL
+    ) {
         if (!Utils::WebUtils::CURL_SETUP) {
             Utils::WebUtils::setup();
         }
 
         CURLM *cm = curl_multi_init();
         curl_multi_setopt(cm, CURLMOPT_MAXCONNECTS, max_parrallel);
+        curl_multi_setopt(cm, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_parrallel);
+        curl_multi_setopt(cm, CURLMOPT_MAX_HOST_CONNECTIONS, 5);
 
-        int downloads;
+    #ifdef CURLPIPE_MULTIPLEX
+        curl_multi_setopt(cm, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
+    #endif
+
+        int downloads = 0;
         int left = 0;
 
-        for(downloads = 0; downloads < max_parrallel && downloads < count; ++downloads) {
-            Utils::WebUtils::add_transfer(cm, urls[downloads], &pages[downloads], &left);
+        for(downloads = 0; downloads < max_parrallel && (size_t) downloads < urls.size; ++downloads) {
+            Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
         }
 
         CURLMsg *msg;
@@ -139,26 +250,49 @@ namespace Utils
         while((msg = curl_multi_info_read(cm, &msgs_left))) {
             if(msg->msg == CURLMSG_DONE) {
-                char *url;
                 CURL *e = msg->easy_handle;
-                curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &url);
+                char *url;
+                Utils::FileUtils::file_body *page;
+
+                curl_easy_getinfo(e, CURLINFO_PRIVATE, &page);
+                curl_easy_getinfo(e, CURLINFO_EFFECTIVE_URL, &url);
+
+                if (msg->data.result == CURLE_OK) {
+                    long statusCode = 0;
+                    curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, &statusCode);
+
+                    if (statusCode == 200) {
+                        char *ctype = NULL;
+                        curl_easy_getinfo(e, CURLINFO_CONTENT_TYPE, &ctype);
+
+                        // @todo: save file (how to handle base resources, either pass base url or save based on url?)
+
+                        if (ctype != NULL && strstr(ctype, "text/html")) {
+                            // @todo: check limits
+                            follow_links(cm, page, url, &left);
+                        }
+                    }
+                }
+
                 curl_multi_remove_handle(cm, e);
                 curl_easy_cleanup(e);
+                free(page->content);
+                free(page);
+
                 --left;
             } else {
                 fprintf(stderr, "E: CURLMsg (%d)\n", msg->msg);
             }
 
-            if(downloads < count) {
-                ++downloads;
-                add_transfer(cm, urls[downloads], &pages[downloads], &left);
+            if((size_t) downloads < urls.size) {
+                Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
+                ++downloads;
             }
         }
 
         if(left > 0) {
             curl_multi_wait(cm, NULL, 0, 1000, NULL);
         }
-
     } while(left > 0);
 
     curl_multi_cleanup(cm);
diff --git a/tests/Utils/WebUtilsTest.cpp b/tests/Utils/WebUtilsTest.cpp
index 4cc2c42..774a4cc 100755
--- a/tests/Utils/WebUtilsTest.cpp
+++ b/tests/Utils/WebUtilsTest.cpp
@@ -25,6 +25,7 @@ int main(int argc, char** argv)
     free(single.content);
 
     /* Multi download */
+    /*
     const char *urls[] = {
         "https://jingga.app/terms",
         "https://jingga.app/imprint",
@@ -44,6 +45,7 @@ int main(int argc, char** argv)
     }
 
     free(multi);
+    */
 
     Utils::WebUtils::clean();
     printf("\n\n");