update download behaviour

This commit is contained in:
Dennis Eichhorn 2022-12-29 17:48:03 +01:00
parent f8878d757a
commit 30d84d59bb
2 changed files with 143 additions and 13 deletions

View File

@ -12,8 +12,14 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>
#include <curl/curl.h> #include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include "FileUtils.h" #include "FileUtils.h"
@ -74,7 +80,16 @@ namespace Utils
return out->size; return out->size;
} }
/**
 * Counted array of resource-type strings.
 *
 * Declared as a named struct instead of `typedef struct { ... } Name;`:
 * an unnamed struct that only receives its name through a typedef must not
 * carry default member initializers.
 */
struct ResourceTypes {
    // presumably the number of entries in `resources` - confirm against callers
    size_t size = 0;

    // array of C strings; empty (nullptr) by default
    const char **resources = nullptr;
};
/**
 * Counted array of URL strings.
 *
 * Declared as a named struct instead of `typedef struct { ... } Name;`:
 * an unnamed struct that only receives its name through a typedef must not
 * carry default member initializers.
 */
struct Urls {
    // number of entries in `urls`
    size_t size = 0;

    // array of C strings holding the URLs; empty (nullptr) by default
    const char **urls = nullptr;
};
Utils::FileUtils::file_body download (char *url) Utils::FileUtils::file_body download (char *url)
{ {
Utils::FileUtils::file_body page = {0}; Utils::FileUtils::file_body page = {0};
@ -84,6 +99,7 @@ namespace Utils
} }
CURL *h = curl_easy_init(); CURL *h = curl_easy_init();
curl_easy_setopt(h, CURLOPT_URL, url); curl_easy_setopt(h, CURLOPT_URL, url);
curl_easy_setopt(h, CURLOPT_PRIVATE, url); curl_easy_setopt(h, CURLOPT_PRIVATE, url);
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L); curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
@ -97,37 +113,126 @@ namespace Utils
} }
inline inline
void add_transfer(CURLM *cm, char *url, Utils::FileUtils::file_body *page, int *left) void add_transfer(CURLM *cm, char *url, int *left)
{ {
CURL *h = curl_easy_init(); CURL *h = curl_easy_init();
Utils::FileUtils::file_body *page = (Utils::FileUtils::file_body *) malloc(sizeof(Utils::FileUtils::file_body));
page->size = 0;
curl_easy_setopt(h, CURLOPT_URL, url); curl_easy_setopt(h, CURLOPT_URL, url);
curl_easy_setopt(h, CURLOPT_PRIVATE, url);
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L); curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L); curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data); curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data);
curl_easy_setopt(h, CURLOPT_WRITEDATA, page); curl_easy_setopt(h, CURLOPT_WRITEDATA, page);
curl_easy_setopt(h, CURLOPT_PRIVATE, page);
curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
curl_easy_setopt(handle, CURLOPT_USERAGENT, "firefox");
curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
curl_multi_add_handle(cm, h); curl_multi_add_handle(cm, h);
++(*left); ++(*left);
} }
Utils::FileUtils::file_body *multi_download (char **urls, int count, int max_parrallel) size_t follow_links(CURLM *cm, Utils::FileUtils::file_body *page, char *url, int *left)
{ {
Utils::FileUtils::file_body *pages = (Utils::FileUtils::file_body *) malloc(count * sizeof(Utils::FileUtils::file_body)); int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
htmlDocPtr doc = htmlReadMemory(page->content, page->size, url, NULL, opts);
if(!doc) {
return 0;
}
xmlChar *xpath = (xmlChar*) "//img/@src";
xmlXPathContextPtr context = xmlXPathNewContext(doc);
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
xmlXPathFreeContext(context);
if(!result) {
return 0;
}
xmlNodeSetPtr nodeset = result->nodesetval;
if(xmlXPathNodeSetIsEmpty(nodeset)) {
xmlXPathFreeObject(result);
return 0;
}
size_t count = 0;
int i;
for(i = 0; i < nodeset->nodeNr; i++) {
double r = rand();
int x = r * nodeset->nodeNr / RAND_MAX;
const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
xmlChar *href = xmlNodeListGetString(doc, node, 1);
// follow relative link
if(true) {
xmlChar *orig = href;
// @todo: consider base= tag which has an impact on relative links
href = xmlBuildURI(href, (xmlChar *) url);
xmlFree(orig);
}
char *link = (char *) href;
if(!link || strlen(link) < 10) {
continue;
}
if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8) || !strncmp(link, "www.", 4)) {
Utils::WebUtils::add_transfer(cm, link, left);
// limit to max 1000 links per page to follow
if(count++ == 1000) {
break;
}
}
xmlFree(link);
}
xmlXPathFreeObject(result);
return count;
}
void *multi_download(
Urls urls,
const char *baseDir,
int max_parrallel = 1,
ResourceTypes *resources = NULL
) {
if (!Utils::WebUtils::CURL_SETUP) { if (!Utils::WebUtils::CURL_SETUP) {
Utils::WebUtils::setup(); Utils::WebUtils::setup();
} }
CURLM *cm = curl_multi_init(); CURLM *cm = curl_multi_init();
curl_multi_setopt(cm, CURLMOPT_MAXCONNECTS, max_parrallel); curl_multi_setopt(cm, CURLMOPT_MAXCONNECTS, max_parrallel);
curl_multi_setopt(cm, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_parrallel);
curl_multi_setopt(cm, CURLMOPT_MAX_HOST_CONNECTIONS, 5);
int downloads; #ifdef CURLPIPE_MULTIPLEX
curl_multi_setopt(cm, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif
int downloads = 0;
int left = 0; int left = 0;
for(downloads = 0; downloads < max_parrallel && downloads < count; ++downloads) { for(downloads = 0; downloads < max_parrallel && downloads < urls.size; ++downloads) {
Utils::WebUtils::add_transfer(cm, urls[downloads], &pages[downloads], &left); Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
} }
CURLMsg *msg; CURLMsg *msg;
@ -139,26 +244,49 @@ namespace Utils
// Drain every pending message of the multi handle: finished transfers are
// inspected, optionally scanned for further links, and then released.
while ((msg = curl_multi_info_read(cm, &msgs_left))) {
    if (msg->msg == CURLMSG_DONE) {
        CURL *e = msg->easy_handle;
        char *url = NULL;
        Utils::FileUtils::file_body *page = NULL;

        // page is the buffer add_transfer registered as private pointer
        curl_easy_getinfo(e, CURLINFO_PRIVATE, &page);
        curl_easy_getinfo(e, CURLINFO_EFFECTIVE_URL, &url);

        if (msg->data.result == CURLE_OK && page && url) {
            long statusCode = 0;
            curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, &statusCode);

            if (statusCode == 200) {
                char *ctype = NULL;
                curl_easy_getinfo(e, CURLINFO_CONTENT_TYPE, &ctype);

                // @todo: save file (how to handle base resources, either pass base url or save based on url?)

                // no minimum length check: a bare "text/html" has 9 characters
                if (ctype != NULL && strstr(ctype, "text/html")) {
                    // @todo: check limits
                    // add_transfer already increments left for every queued
                    // link, so the returned count must not be added again
                    follow_links(cm, page, url, &left);
                }
            }
        }

        // url points into the easy handle, so it is unusable after cleanup
        curl_multi_remove_handle(cm, e);
        curl_easy_cleanup(e);

        if (page) {
            free(page->content);
            free(page);
        }

        --left;
    } else {
        fprintf(stderr, "E: CURLMsg (%d)\n", msg->msg);
    }

    // downloads is the index of the next URL that has not been queued yet
    if ((size_t) downloads < urls.size) {
        Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
        ++downloads;
    }
}
if(left > 0) { if(left > 0) {
curl_multi_wait(cm, NULL, 0, 1000, NULL); curl_multi_wait(cm, NULL, 0, 1000, NULL);
} }
} while(left > 0); } while(left > 0);
curl_multi_cleanup(cm); curl_multi_cleanup(cm);

View File

@ -25,6 +25,7 @@ int main(int argc, char** argv)
free(single.content); free(single.content);
/* Multi download */ /* Multi download */
/*
const char *urls[] = { const char *urls[] = {
"https://jingga.app/terms", "https://jingga.app/terms",
"https://jingga.app/imprint", "https://jingga.app/imprint",
@ -44,6 +45,7 @@ int main(int argc, char** argv)
} }
free(multi); free(multi);
*/
Utils::WebUtils::clean(); Utils::WebUtils::clean();
printf("\n\n"); printf("\n\n");