update download behaviour
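add_transfer() now allocates the page buffer itself and stores it on the easy handle via CURLOPT_PRIVATE instead of writing into a caller-provided array. Finished text/html pages are parsed with libxml2 and a capped random sample of the urls they reference is queued as new transfers (follow_links()). multi_download() takes the new Urls/ResourceTypes structs plus a base directory and checks status code and content type before following links.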

Dennis Eichhorn 2022-12-29 17:48:03 +01:00
parent f8878d757a
commit 30d84d59bb
2 changed files with 143 additions and 13 deletions

View File

@@ -12,8 +12,14 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include "FileUtils.h"
@@ -74,7 +80,16 @@ namespace Utils
return out->size;
}
typedef struct {
size_t size = 0;
const char **resources = NULL;
} ResourceTypes;
typedef struct {
size_t size = 0;
const char **urls = NULL;
} Urls;
inline
Utils::FileUtils::file_body download (char *url)
{
Utils::FileUtils::file_body page = {0};
@@ -84,6 +99,7 @@ namespace Utils
}
CURL *h = curl_easy_init();
curl_easy_setopt(h, CURLOPT_URL, url);
curl_easy_setopt(h, CURLOPT_PRIVATE, url);
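// stash the url on the handle; it can be read back later via CURLINFO_PRIVATE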
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
@@ -97,37 +113,126 @@ namespace Utils
}
inline
void add_transfer(CURLM *cm, char *url, int *left)
{
CURL *h = curl_easy_init();
// the transfer owns its page buffer; the completion code reads it back from
// CURLINFO_PRIVATE and frees it
Utils::FileUtils::file_body *page = (Utils::FileUtils::file_body *) malloc(sizeof(Utils::FileUtils::file_body));
page->size = 0;
page->content = NULL;
curl_easy_setopt(h, CURLOPT_URL, url);
curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data);
curl_easy_setopt(h, CURLOPT_WRITEDATA, page);
curl_easy_setopt(h, CURLOPT_PRIVATE, page);
curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
curl_easy_setopt(handle, CURLOPT_USERAGENT, "firefox");
curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
curl_multi_add_handle(cm, h);
++(*left);
}
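// follow_links: parse the finished page with libxml2, pick random matches of
// the xpath below and queue them as new transfers; returns how many links
// were queued (add_transfer() already bumps *left for each one)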
size_t follow_links(CURLM *cm, Utils::FileUtils::file_body *page, char *url, int *left)
{
int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
htmlDocPtr doc = htmlReadMemory(page->content, page->size, url, NULL, opts);
if(!doc) {
return 0;
}
xmlChar *xpath = (xmlChar*) "//img/@src";
xmlXPathContextPtr context = xmlXPathNewContext(doc);
xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
xmlXPathFreeContext(context);
if(!result) {
xmlFreeDoc(doc);
return 0;
}
xmlNodeSetPtr nodeset = result->nodesetval;
if(xmlXPathNodeSetIsEmpty(nodeset)) {
xmlXPathFreeObject(result);
xmlFreeDoc(doc);
return 0;
}
size_t count = 0;
int i;
for(i = 0; i < nodeset->nodeNr; i++) {
// pick a random match; the modulo keeps the index in [0, nodeNr - 1]
int x = rand() % nodeset->nodeNr;
const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
xmlChar *href = xmlNodeListGetString(doc, node, 1);
if(!href) {
continue;
}
// resolve the (possibly relative) link against the page url
// @todo: consider base= tag which has an impact on relative links
xmlChar *orig = href;
href = xmlBuildURI(href, (xmlChar *) url);
xmlFree(orig);
char *link = (char *) href;
if(!link || strlen(link) < 10) {
continue;
}
if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8) || !strncmp(link, "www.", 4)) {
Utils::WebUtils::add_transfer(cm, link, left);
// limit to max 1000 links per page to follow
if(count++ == 1000) {
break;
}
}
xmlFree(link);
}
xmlXPathFreeObject(result);
xmlFreeDoc(doc);
return count;
}
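// multi_download drains the multi handle while follow_links() keeps feeding
// it, so the number of transfers can grow beyond urls.size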
void multi_download(
Urls urls,
const char *baseDir,
int max_parallel = 1,
ResourceTypes *resources = NULL
) {
if (!Utils::WebUtils::CURL_SETUP) {
Utils::WebUtils::setup();
}
CURLM *cm = curl_multi_init();
curl_multi_setopt(cm, CURLMOPT_MAXCONNECTS, (long) max_parallel);
curl_multi_setopt(cm, CURLMOPT_MAX_TOTAL_CONNECTIONS, (long) max_parallel);
curl_multi_setopt(cm, CURLMOPT_MAX_HOST_CONNECTIONS, 5L);
#ifdef CURLPIPE_MULTIPLEX
curl_multi_setopt(cm, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif
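// when libcurl is built with HTTP/2, multiplexing lets transfers share connections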
int downloads = 0;
int left = 0;
for(downloads = 0; downloads < max_parrallel && downloads < count; ++downloads) {
Utils::WebUtils::add_transfer(cm, urls[downloads], &pages[downloads], &left);
for(downloads = 0; downloads < max_parrallel && downloads < urls.size; ++downloads) {
Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
}
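// the window is filled up front; the completion loop below starts one new
// transfer for every finished one until urls.size is exhausted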
CURLMsg *msg;
@@ -139,26 +244,49 @@ namespace Utils
while((msg = curl_multi_info_read(cm, &msgs_left))) {
if(msg->msg == CURLMSG_DONE) {
CURL *e = msg->easy_handle;
char *url;
Utils::FileUtils::file_body *page;
curl_easy_getinfo(e, CURLINFO_PRIVATE, &page);
curl_easy_getinfo(e, CURLINFO_EFFECTIVE_URL, &url);
if (msg->data.result == CURLE_OK) {
long statusCode = 0;
curl_easy_getinfo(e, CURLINFO_RESPONSE_CDE, &statusCode);
if (statusCode == 200) {
char *ctype;
curl_easy_getinfo(e, CURLINFO_CONTENT_TYPE, &ctype);
// @todo: save file (how to handle base resources, either pass base url or save based on url?)
if (ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html")) {
// @todo: check limits
// add_transfer() already increments left for every queued link
follow_links(cm, page, url, &left);
}
}
}
curl_multi_remove_handle(cm, e);
curl_easy_cleanup(e);
free(page->content);
free(page);
--left;
} else {
fprintf(stderr, "E: CURLMsg (%d)\n", msg->msg);
}
if(downloads < count) {
if(downloads < urls.size) {
++downloads;
add_transfer(cm, urls[downloads], &pages[downloads], &left);
Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
}
}
if(left > 0) {
curl_multi_wait(cm, NULL, 0, 1000, NULL);
}
} while(left > 0);
curl_multi_cleanup(cm);

View File

@@ -25,6 +25,7 @@ int main(int argc, char** argv)
free(single.content);
/* Multi download */
/*
const char *urls[] = {
"https://jingga.app/terms",
"https://jingga.app/imprint",
@@ -44,6 +45,7 @@ int main(int argc, char** argv)
}
free(multi);
*/
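/* A minimal sketch of the reworked call, assuming the Urls struct introduced
in the header above; the directory and parallelism values are made up:
const char *list[] = { "https://jingga.app/terms", "https://jingga.app/imprint" };
Urls urls = { 2, list };
Utils::WebUtils::multi_download(urls, "downloads", 3, NULL);
*/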
Utils::WebUtils::clean();
printf("\n\n");