/**
 * Karaka
 *
 * @package Utils
 * @copyright Dennis Eichhorn
 * @license OMS License 1.0
 * @version 1.0.0
 * @link https://jingga.app
 */
#ifndef UTILS_WEB_UTILS_H
#define UTILS_WEB_UTILS_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <signal.h>

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>

#include "FileUtils.h"

namespace Utils
{
namespace WebUtils
{
static bool CURL_SETUP = false;

inline void setup()
{
    curl_global_init(CURL_GLOBAL_DEFAULT);
    Utils::WebUtils::CURL_SETUP = true;
}

inline void clean()
{
    curl_global_cleanup();
    Utils::WebUtils::CURL_SETUP = false;
}

// libcurl write callback: accumulates received chunks in a Utils::FileUtils::file_body.
// It must return the number of bytes handled in this call, otherwise curl aborts the transfer.
size_t write_download_data(void *ptr, size_t size, size_t nmemb, void *stream)
{
    Utils::FileUtils::file_body *out = (Utils::FileUtils::file_body *) stream;
    size_t outSize = size * nmemb;

    if (out->size == 0) {
        // first time this function is called for a specific resource
        out->content = (char *) malloc((outSize + 1) * sizeof(char));
        if (!out->content) {
            fprintf(stderr, "CRITICAL: malloc failed\n");

            return 0;
        }

        memcpy(out->content, ptr, outSize * sizeof(char));

        out->size = (int) outSize;
        out->content[out->size] = 0;
    } else {
        // the max buffer (16384 = 16k) is exceeded, this is called again and the buffer needs to grow
        char *temp = (char *) malloc((outSize + out->size + 1) * sizeof(char));
        if (!temp) {
            fprintf(stderr, "CRITICAL: malloc failed\n");

            return 0;
        }

        memcpy(temp, out->content, out->size * sizeof(char));
        memcpy(temp + out->size, ptr, outSize * sizeof(char));

        free(out->content);

        out->content = temp;
        out->size += outSize;
        out->content[out->size] = 0;
    }

    return outSize;
}

typedef struct {
    size_t size = 0;
    const char **resources = NULL;
} ResourceTypes;

typedef struct {
    size_t size = 0;
    const char **urls = NULL;
} Urls;

Utils::FileUtils::file_body download(char *url)
{
    Utils::FileUtils::file_body page = {0};

    if (!Utils::WebUtils::CURL_SETUP) {
        Utils::WebUtils::setup();
    }

    CURL *h = curl_easy_init();

    curl_easy_setopt(h, CURLOPT_URL, url);
    curl_easy_setopt(h, CURLOPT_PRIVATE, url);
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
    curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data);
    curl_easy_setopt(h, CURLOPT_WRITEDATA, &page);
    curl_easy_perform(h);
    curl_easy_cleanup(h);

    return page;
}

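// Usage sketch (illustrative): fetch a single page into memory and release it afterwards.
// Assumes file_body from "FileUtils.h" exposes the content/size members used by the
// write callback above.
//
//     Utils::FileUtils::file_body page = Utils::WebUtils::download((char *) "https://jingga.app");
//     if (page.size > 0) {
//         fwrite(page.content, 1, (size_t) page.size, stdout);
//     }
//
//     free(page.content);
//     Utils::WebUtils::clean();
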
inline void add_transfer(CURLM *cm, const char *url, int *left)
{
    CURL *h = curl_easy_init();

    Utils::FileUtils::file_body *page = (Utils::FileUtils::file_body *) malloc(sizeof(Utils::FileUtils::file_body));
    page->size    = 0;
    page->content = NULL;

    curl_easy_setopt(h, CURLOPT_URL, url);
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(h, CURLOPT_SSL_VERIFYHOST, 0L);
    curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, write_download_data);
    curl_easy_setopt(h, CURLOPT_WRITEDATA, page);
    curl_easy_setopt(h, CURLOPT_PRIVATE, page);

    curl_easy_setopt(h, CURLOPT_ACCEPT_ENCODING, "");
    curl_easy_setopt(h, CURLOPT_TIMEOUT, 5L);
    curl_easy_setopt(h, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(h, CURLOPT_MAXREDIRS, 10L);
    curl_easy_setopt(h, CURLOPT_CONNECTTIMEOUT, 2L);
    curl_easy_setopt(h, CURLOPT_COOKIEFILE, "");
    curl_easy_setopt(h, CURLOPT_FILETIME, 1L);
    curl_easy_setopt(h, CURLOPT_USERAGENT, "firefox");
    curl_easy_setopt(h, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
    curl_easy_setopt(h, CURLOPT_UNRESTRICTED_AUTH, 1L);
    curl_easy_setopt(h, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
    curl_easy_setopt(h, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);

    curl_multi_add_handle(cm, h);

    ++(*left);
}

size_t follow_links(CURLM *cm, Utils::FileUtils::file_body *page, char *url, int *left)
{
    int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
    htmlDocPtr doc = htmlReadMemory(page->content, page->size, url, NULL, opts);

    if (!doc) {
        return 0;
    }

    xmlChar *xpath = (xmlChar *) "//img/@src";
    xmlXPathContextPtr context = xmlXPathNewContext(doc);
    xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
    xmlXPathFreeContext(context);

    if (!result) {
        xmlFreeDoc(doc);

        return 0;
    }

    xmlNodeSetPtr nodeset = result->nodesetval;
    if (xmlXPathNodeSetIsEmpty(nodeset)) {
        xmlXPathFreeObject(result);
        xmlFreeDoc(doc);

        return 0;
    }

    size_t count = 0;
    int i;

    for (i = 0; i < nodeset->nodeNr; i++) {
        // pick a random matching node per iteration
        int x = rand() % nodeset->nodeNr;

        const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
        xmlChar *href = xmlNodeListGetString(doc, node, 1);

        // resolve relative links against the page url; absolute links pass through unchanged
        // @todo: consider base= tag which has an impact on relative links
        xmlChar *orig = href;
        href = xmlBuildURI(href, (xmlChar *) url);
        xmlFree(orig);

        char *link = (char *) href;
        if (!link || strlen(link) < 10) {
            if (link) {
                xmlFree(link);
            }

            continue;
        }

        if (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8) || !strncmp(link, "www.", 4)) {
            Utils::WebUtils::add_transfer(cm, link, left);

            // limit to max 1000 links per page to follow
            if (++count >= 1000) {
                xmlFree(link);

                break;
            }
        }

        xmlFree(link);
    }

    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);

    return count;
}

void *multi_download(
    Urls urls,
    const char *baseDir,
    int max_parallel = 1,
    ResourceTypes *resources = NULL
) {
    if (!Utils::WebUtils::CURL_SETUP) {
        Utils::WebUtils::setup();
    }

    CURLM *cm = curl_multi_init();
    curl_multi_setopt(cm, CURLMOPT_MAXCONNECTS, (long) max_parallel);
    curl_multi_setopt(cm, CURLMOPT_MAX_TOTAL_CONNECTIONS, (long) max_parallel);
    curl_multi_setopt(cm, CURLMOPT_MAX_HOST_CONNECTIONS, 5L);

#ifdef CURLPIPE_MULTIPLEX
    curl_multi_setopt(cm, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
#endif

    size_t downloads = 0;
    int left = 0;

    for (downloads = 0; downloads < (size_t) max_parallel && downloads < urls.size; ++downloads) {
        Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
    }

    CURLMsg *msg;
    int msgs_left = -1;

    do {
        int alive = 1;
        curl_multi_perform(cm, &alive);

        while ((msg = curl_multi_info_read(cm, &msgs_left))) {
            if (msg->msg == CURLMSG_DONE) {
                CURL *e = msg->easy_handle;
                char *url;
                Utils::FileUtils::file_body *page;

                curl_easy_getinfo(e, CURLINFO_PRIVATE, &page);
                curl_easy_getinfo(e, CURLINFO_EFFECTIVE_URL, &url);

                if (msg->data.result == CURLE_OK) {
                    long statusCode = 0;
                    curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, &statusCode);

                    if (statusCode == 200) {
                        char *ctype;
                        curl_easy_getinfo(e, CURLINFO_CONTENT_TYPE, &ctype);

                        // @todo: save file (how to handle base resources, either pass base url or save based on url?)

                        if (ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html")) {
                            // @todo: check limits
                            // add_transfer() already increments left for every link it queues
                            follow_links(cm, page, url, &left);
                        }
                    }
                }

                curl_multi_remove_handle(cm, e);
                curl_easy_cleanup(e);
                free(page->content);
                free(page);

                --left;
            } else {
                fprintf(stderr, "E: CURLMsg (%d)\n", msg->msg);
            }

            if (downloads < urls.size) {
                Utils::WebUtils::add_transfer(cm, urls.urls[downloads], &left);
                ++downloads;
            }
        }

        if (left > 0) {
            curl_multi_wait(cm, NULL, 0, 1000, NULL);
        }
    } while (left > 0);

    curl_multi_cleanup(cm);

    // @todo: collect and return the downloaded pages instead of discarding them
    return NULL;
}
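
// Usage sketch (illustrative): crawl a couple of start urls with up to 4 parallel transfers.
// The url array is owned by the caller; baseDir and resources are not used yet (see the
// @todo above), so the values below are placeholders only.
//
//     const char *start[] = { "https://jingga.app", "https://example.com" };
//
//     Utils::WebUtils::Urls urls;
//     urls.size = 2;
//     urls.urls = start;
//
//     Utils::WebUtils::multi_download(urls, "./downloads", 4);
//     Utils::WebUtils::clean();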
} // namespace WebUtils
} // namespace Utils

#endif