Browser-fast parallel downloads with libcURL
Sample code in plain C showing how to download a ton of small files in a fraction of the time a one-by-one loop would take. The sample targets MSVC on Windows (it relies on fopen_s(), memcpy_s(), __int64 and MAX_PATH).
I don't know if this will ever help anyone, but in case you ask: the same code has also been tested with bigger files (and by bigger I mean several gigabytes each), and it works with no problems.
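The heavy lifting is done by libcurl's multi interface: every file gets its own easy handle, all the handles are added to a single multi handle, and one thread drives them all concurrently through curl_multi_perform(). Here is that core pattern as a minimal, standalone sketch (the example.com URLs are placeholders and the downloaded bytes are simply thrown away); the full sample that follows layers slot management, progress reporting and saving to disk on top of it:

// Minimal multi-interface sketch - hypothetical URLs, response bodies discarded
#include <curl/curl.h>

#define SKETCH_URLS 2

static size_t discard_body(char *data, size_t size, size_t nitems, void *userdata) {
    (void)data; (void)userdata;
    return size * nitems; // Tells libcurl the whole chunk was consumed
}

int main(void) {
    const char *szURLs[SKETCH_URLS] = { "http://example.com/a.jpg", "http://example.com/b.jpg" };
    CURL *curlHandles[SKETCH_URLS];
    CURLM *curlMulti;
    int iK, iRunning;
    curl_global_init(CURL_GLOBAL_ALL);
    curlMulti = curl_multi_init();
    for (iK = 0; iK < SKETCH_URLS; iK++) { // One easy handle per transfer, all added up front
        curlHandles[iK] = curl_easy_init();
        curl_easy_setopt(curlHandles[iK], CURLOPT_URL, szURLs[iK]);
        curl_easy_setopt(curlHandles[iK], CURLOPT_WRITEFUNCTION, discard_body);
        curl_multi_add_handle(curlMulti, curlHandles[iK]);
    }
    do { // A single thread drives every transfer concurrently
        curl_multi_perform(curlMulti, &iRunning);
        if (iRunning)
            curl_multi_wait(curlMulti, NULL, 0, 1000, NULL);
    } while (iRunning);
    for (iK = 0; iK < SKETCH_URLS; iK++) {
        curl_multi_remove_handle(curlMulti, curlHandles[iK]);
        curl_easy_cleanup(curlHandles[iK]);
    }
    curl_multi_cleanup(curlMulti);
    curl_global_cleanup();
    return 0;
}

The full sample: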
// Parallel downloads sample - based on https://curl.haxx.se/libcurl/c/10-at-a-time.html
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h> // MAX_PATH
#include <curl/curl.h>
// Available download slots. The smaller the download resources, the higher this value can be.
// Don't set it too high: every started job holds an open output file, so a failing fopen_s()
// (too many open files) would make some slots impossible to fill.
#define MAX_SIMULTANEOUS_DOWNLOADS 200
typedef struct {
int iIndex;
char *szURL;
char *szPath;
} ProgressHelper;
typedef struct {
unsigned __int64 ui64Size;
char *cData;
} DownloadHelper;
typedef struct {
int iTotalDownloads;
bool *bDownloaded, *bDownloading;
char **szURLs, **szPaths;
FILE **fDownloads;
CURL **curlDownloads;
ProgressHelper *phProgress;
DownloadHelper *dhDownload;
} MultiDownloadHelper;
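// One global multi handle plus a fixed pool of reusable easy handles (the "download slots").
// Reusing the same easy handles avoids a curl_easy_init()/curl_easy_cleanup() round trip per file.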
CURLM *curlMultiHandle;
CURL *curlSharedHandles[MAX_SIMULTANEOUS_DOWNLOADS];
bool bBusyHandles[MAX_SIMULTANEOUS_DOWNLOADS];
const char *szSourceURL = "http://alvein.freevar.com";
const char *szDownloadFolder = "C:\\Users\\Alvein\\Avatars";
static size_t write_callback(char *data, size_t size, size_t nitems, void *userdata) {
// write_callback(): receives incoming download data and "saves" it in a DownloadHelper structure.
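// Note: the buffer grows by exactly one chunk per call - simple, and fine for small files,
// though each realloc() may have to move the whole buffer.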
unsigned __int64 ui64DataSize = size * nitems;
DownloadHelper *dhCurrentDownload = (DownloadHelper *)userdata;
char *cDownloadedData = (char *)realloc(dhCurrentDownload->cData,
dhCurrentDownload->ui64Size + ui64DataSize);
if (NULL != cDownloadedData) {
// Saves the downloaded chunk (data) at the end of the downloaded data (cDownloadedData)
if (0 == memcpy_s(cDownloadedData + dhCurrentDownload->ui64Size,
ui64DataSize, // Destination size: exactly the space realloc() just added
data,
ui64DataSize)) {
dhCurrentDownload->cData = cDownloadedData;
dhCurrentDownload->ui64Size += ui64DataSize;
return ui64DataSize;
}
}
return 0;
}
static int progress_callback(void *userdata, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow) {
// progress_callback(): just a simple callback for future use.
ProgressHelper *phCurrentDownload = (ProgressHelper *)userdata;
if (dltotal)
fprintf(stderr,"%s: %lld of %lld\n", phCurrentDownload->szURL, dlnow, dltotal);
return CURL_PROGRESSFUNC_CONTINUE;
}
bool singleDownload(const char *szURL, char **cContentData, unsigned __int64 *ui64ContentLength) {
// singleDownload(): downloads the resource in szURL.
// cContentData: returned array of bytes (not a string). Must be released by caller.
// ui64ContentLength: the content length written in cContentData.
bool bResult = false;
CURL *curlHandle;
DownloadHelper dhSingle = { 0,NULL };
*cContentData = NULL;
*ui64ContentLength = 0;
curlHandle = curl_easy_init();
if (NULL != curlHandle) {
curl_easy_setopt(curlHandle, CURLOPT_URL, szURL);
curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, write_callback);
curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, &dhSingle);
if (CURLE_OK == curl_easy_perform(curlHandle))
if (dhSingle.ui64Size) {
*cContentData = dhSingle.cData;
*ui64ContentLength = dhSingle.ui64Size;
bResult = true;
}
}
curl_easy_cleanup(curlHandle);
if (!bResult)
free(dhSingle.cData); // Releases any partially downloaded data on failure
return bResult;
}
bool multiDownload_StartOne(MultiDownloadHelper *mdhHelper, int iIndex) {
// multiDownload_StartOne(): adds a given download job to the multi interface
bool bResult = false;
int iK;
FILE *fHandle;
CURL *curlHandle;
if (0 == fopen_s(&fHandle, mdhHelper->szPaths[iIndex], "wb")) {
// Finds a free download slot
for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++)
if (!bBusyHandles[iK])
break;
if (iK < MAX_SIMULTANEOUS_DOWNLOADS) {
curlHandle = curlSharedHandles[iK];
bBusyHandles[iK] = true; // Seizes the download slot
mdhHelper->fDownloads[iIndex] = fHandle;
mdhHelper->curlDownloads[iIndex] = curlHandle; // Assigns the shared handle to this job
mdhHelper->phProgress[iIndex] = (ProgressHelper){ iIndex, mdhHelper->szURLs[iIndex], mdhHelper->szPaths[iIndex] };
free(mdhHelper->dhDownload[iIndex].cData); // Releases partial data left over by a failed earlier attempt
mdhHelper->dhDownload[iIndex] = (DownloadHelper){ 0, NULL }; // Resets the download progress
curl_easy_setopt(curlHandle, CURLOPT_URL, mdhHelper->szURLs[iIndex]);
curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, write_callback);
curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, &mdhHelper->dhDownload[iIndex]);
#ifdef _DEBUG // Progress is disabled in Release - too much stuff on the console
curl_easy_setopt(curlHandle, CURLOPT_NOPROGRESS, 0L);
curl_easy_setopt(curlHandle, CURLOPT_XFERINFOFUNCTION, progress_callback);
curl_easy_setopt(curlHandle, CURLOPT_XFERINFODATA, &mdhHelper->phProgress[iIndex]);
fprintf(stderr, "multiDownload_StartOne(%d)...\n", iIndex);
#endif
curl_multi_add_handle(curlMultiHandle, curlHandle);
bResult = true;
}
else
fclose(fHandle); // No free slot: closes the output file so the handle doesn't leak
}
return bResult;
}
void multiDownload(MultiDownloadHelper *mdhHelper) {
// multiDownload(): performs all the download jobs contained in mdhHelper.
int iK, iJ, iActiveDownloads, iTotalDownloaded, iActiveHandles, iPendingMessages;
CURLMsg *curlMessage;
// Finds every not-completed/not-busy download job...
iActiveDownloads = iTotalDownloaded = 0;
for (; iActiveDownloads < MAX_SIMULTANEOUS_DOWNLOADS; iActiveDownloads++) {
for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
if (!mdhHelper->bDownloaded[iK])
if (!mdhHelper->bDownloading[iK])
break;
if (iK < mdhHelper->iTotalDownloads)
mdhHelper->bDownloading[iK] = multiDownload_StartOne(mdhHelper, iK); // ...and starts them...
else
break;
} // ...as long as there are no more than MAX_SIMULTANEOUS_DOWNLOADS active jobs
do {
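// Main transfer loop: pumps all the active transfers, then drains the completion messages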
curl_multi_perform(curlMultiHandle, &iActiveHandles);
do {
curlMessage = curl_multi_info_read(curlMultiHandle, &iPendingMessages);
if (NULL != curlMessage) {
// Finds the index of the download job the received message belongs to
for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
if (curlMessage->easy_handle == mdhHelper->curlDownloads[iK])
break;
if (iK < mdhHelper->iTotalDownloads) {
if (CURLMSG_DONE == curlMessage->msg) {
if (CURLE_OK == curlMessage->data.result) {
long lResCode;
curl_easy_getinfo(mdhHelper->curlDownloads[iK], CURLINFO_RESPONSE_CODE, &lResCode);
// The response code is ignored in this sample (let's assume it's always HTTP 200 OK)
mdhHelper->bDownloaded[iK] = true;
mdhHelper->bDownloading[iK] = false;
iTotalDownloaded++;
fwrite(mdhHelper->dhDownload[iK].cData,
sizeof(char),
mdhHelper->dhDownload[iK].ui64Size,
mdhHelper->fDownloads[iK]); // Saves the downloaded file in a single shot
#ifdef _DEBUG
fprintf(stderr, "\nDownload is complete (%ld): %s\n", lResCode, mdhHelper->szPaths[iK]);
#endif
}
else {
fprintf(stderr, "\n**Download failed (%d): %s\n", curlMessage->data.result, mdhHelper->szPaths[iK]);
mdhHelper->bDownloading[iK] = false;
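// bDownloaded[iK] stays false, so the pending-jobs loop below will retry this download
// (beware: a permanently failing URL keeps this loop running forever)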
}
fclose(mdhHelper->fDownloads[iK]);
mdhHelper->fDownloads[iK] = NULL;
curl_multi_remove_handle(curlMultiHandle, mdhHelper->curlDownloads[iK]);
// Instead of calling curl_easy_cleanup(mdhHelper->curlDownloads[iK])...
for (iJ = 0; iJ < MAX_SIMULTANEOUS_DOWNLOADS; iJ++)
if (curlSharedHandles[iJ] == mdhHelper->curlDownloads[iK])
break;
if (iJ < MAX_SIMULTANEOUS_DOWNLOADS)
bBusyHandles[iJ] = false; // ...frees the download slot this handle was seized from
mdhHelper->curlDownloads[iK] = NULL;
iActiveDownloads--;
if (iTotalDownloaded < mdhHelper->iTotalDownloads) {
// Finds all the pending download jobs, and starts them...
for (; iActiveDownloads < MAX_SIMULTANEOUS_DOWNLOADS; iActiveDownloads++) {
for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
if (!mdhHelper->bDownloaded[iK])
if (!mdhHelper->bDownloading[iK])
break;
if (iK < mdhHelper->iTotalDownloads)
mdhHelper->bDownloading[iK] = multiDownload_StartOne(mdhHelper, iK);
else
break;
} // ...as long as there are no more than MAX_SIMULTANEOUS_DOWNLOADS active jobs
}
}
else // Unlikely, but handled anyway
fprintf(stderr, "\n!!Unknown message (%d): %s\n", curlMessage->msg, mdhHelper->szPaths[iK]);
}
else // Should never happen: every added handle comes from the shared pool
fprintf(stderr, "\n!!Could not find the messaging handle in the downloads list\n");
}
} while (NULL != curlMessage);
if (iActiveHandles) // Waits up to one second for activity on the still-running transfers...
curl_multi_wait(curlMultiHandle, NULL, 0, 1000, NULL); // ...before continuing the messages poll
else
if (iTotalDownloaded == mdhHelper->iTotalDownloads)
break; // Exits if every download job has finished
} while (true);
}
void allocMultiDownloadHelper(MultiDownloadHelper *mdhHelper, int iHowMany) {
// allocMultiDownloadHelper(): allocates the required memory for every download job.
mdhHelper->iTotalDownloads = iHowMany;
mdhHelper->bDownloaded = (bool *)malloc(iHowMany * sizeof(bool));
mdhHelper->bDownloading = (bool *)malloc(iHowMany * sizeof(bool));
mdhHelper->szURLs = (char **)malloc(iHowMany * sizeof(char *));
mdhHelper->szPaths = (char **)malloc(iHowMany * sizeof(char *));
mdhHelper->fDownloads = (FILE **)calloc(iHowMany, sizeof(FILE *));
mdhHelper->curlDownloads = (CURL **)calloc(iHowMany, sizeof(CURL *)); // Jobs not started yet must compare as NULL in the message loop
mdhHelper->phProgress = (ProgressHelper *)malloc(iHowMany * sizeof(ProgressHelper));
mdhHelper->dhDownload = (DownloadHelper *)calloc(iHowMany, sizeof(DownloadHelper)); // cData must start as NULL so it can be free()d before a retry
}
void freeMultiDownloadHelper(MultiDownloadHelper mdhHelper) {
// freeMultiDownloadHelper(): releases the memory allocated for every download job.
for (int iK = 0; iK < mdhHelper.iTotalDownloads; iK++) {
free(mdhHelper.szURLs[iK]);
free(mdhHelper.szPaths[iK]);
free(mdhHelper.dhDownload[iK].cData);
}
free(mdhHelper.bDownloaded);
free(mdhHelper.bDownloading);
free(mdhHelper.szURLs);
free(mdhHelper.szPaths);
free(mdhHelper.fDownloads);
free(mdhHelper.curlDownloads);
free(mdhHelper.phProgress);
free(mdhHelper.dhDownload);
}
void parseHTMLImgTags(char *szHTML, char ***szImgSources, int *iTotal) {
// parseHTMLImgTags(): shameless <img> tags parsing in the HTML content supplied in szHTML.
// Not to be taken seriously.
// szImgSources: returned array of URLs as NULL-terminated strings.
// iTotal: the number of image URLs found.
unsigned __int64 ui64ImgSrcLen;
char *szHTMLNdx, *szImgSrc, **szRllSources,
*szImgTagStart, *szImgTagEnd, *szSrcAttStart, *szSrcAttEnd;
*iTotal = 0;
*szImgSources = NULL;
szHTMLNdx = szHTML;
do {
szImgTagStart = strstr(szHTMLNdx, "<img ");
if (NULL != szImgTagStart) {
szImgTagEnd = strstr(szImgTagStart + 5, ">");
if (NULL != szImgTagEnd) {
szSrcAttStart = strstr(szImgTagStart, "src=\"");
if (NULL != szSrcAttStart && szSrcAttStart < szImgTagEnd) { // Makes sure the src attribute belongs to this tag
szSrcAttEnd = strstr(szSrcAttStart + 5, "\"");
if (NULL != szSrcAttEnd) {
ui64ImgSrcLen = szSrcAttEnd - szSrcAttStart - 5;
szImgSrc = (char *)malloc(ui64ImgSrcLen + 1);
if (NULL != szImgSrc && 0 == strncpy_s(szImgSrc, ui64ImgSrcLen + 1, szSrcAttStart + 5, ui64ImgSrcLen)) {
szImgSrc[ui64ImgSrcLen] = '\0';
szRllSources = (char **)realloc(*szImgSources, (*iTotal + 1) * sizeof(char *));
if (NULL != szRllSources) {
*szImgSources = szRllSources;
(*szImgSources)[(*iTotal)++] = _strdup(szImgSrc);
}
}
free(szImgSrc);
}
}
}
if (NULL != szImgTagEnd)
szHTMLNdx = szImgTagEnd + 1;
else
szImgTagStart = NULL; // Unterminated <img> tag: stops the scan instead of using a NULL pointer
}
} while (NULL != szImgTagStart);
}
int main(void) {
int iResult = EXIT_FAILURE, iK, iTotalDownloads;
unsigned __int64 ui64HTMLSize;
char *cHTML, *szImgExt, **szURLs, szLocalFile[MAX_PATH];
double dblElapsed;
time_t tmTimer;
FILE *fHTML;
MultiDownloadHelper mdhDownloads;
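// libcurl's global state must be initialized once, before any other libcurl call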
curl_global_init(CURL_GLOBAL_ALL);
time(&tmTimer);
// Downloads the source web page
if (singleDownload(szSourceURL, &cHTML, &ui64HTMLSize)) {
dblElapsed = difftime(time(NULL), tmTimer);
iTotalDownloads = 0;
szURLs = NULL;
sprintf_s(szLocalFile, MAX_PATH, "%s\\source.html", szDownloadFolder);
if (0 != fopen_s(&fHTML, szLocalFile, "w"))
fHTML = NULL; // The page is still parsed for images even when it can't be saved
if (ui64HTMLSize) {
// Saves the content in the download folder
if (NULL != fHTML)
fwrite(cHTML, sizeof(char), ui64HTMLSize, fHTML);
char *cResized = (char *)realloc(cHTML, ui64HTMLSize + 1); // Temp pointer: keeps cHTML valid if realloc() fails
if (NULL != cResized) {
cHTML = cResized;
cHTML[ui64HTMLSize] = '\0'; // Assumes the content is HTML - handles it as ASCIIZ
parseHTMLImgTags(cHTML, &szURLs, &iTotalDownloads);
}
}
if (NULL != fHTML)
fclose(fHTML);
free(cHTML);
if (iTotalDownloads) {
// Initializes every handle in the download slots - sets them as "available"
for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++) {
curlSharedHandles[iK] = curl_easy_init();
bBusyHandles[iK] = false;
}
allocMultiDownloadHelper(&mdhDownloads, iTotalDownloads);
// Initializes the download jobs (1 per image resource)
for (iK = 0; iK < iTotalDownloads; iK++) {
#ifdef _DEBUG
fprintf(stderr, "Image resource: %s\n", szURLs[iK]);
#endif
mdhDownloads.bDownloaded[iK] = mdhDownloads.bDownloading[iK] = false;
mdhDownloads.szURLs[iK] = szURLs[iK];
// Makes the local filename for each job - just a numeric sequence, for simplicity
mdhDownloads.szPaths[iK] = (char *)malloc(MAX_PATH * sizeof(char));
sprintf_s(mdhDownloads.szPaths[iK], MAX_PATH, "%s\\%05d", szDownloadFolder, iK);
// Adds a file extension, based on the image resource URL - rudimentary method
szImgExt = strrchr(szURLs[iK], '.');
if (NULL != szImgExt && szImgExt == strstr(szImgExt, ".jpg"))
strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".jpg");
else if (NULL != szImgExt && szImgExt == strstr(szImgExt, ".png"))
strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".png");
else if (NULL != szImgExt && szImgExt == strstr(szImgExt, ".gif"))
strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".gif");
else
strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".tmp"); // Unknown or missing extension
}
curlMultiHandle = curl_multi_init();
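// Sizes libcurl's connection cache to match the slot count, so finished connections can be reused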
curl_multi_setopt(curlMultiHandle, CURLMOPT_MAXCONNECTS, (long)MAX_SIMULTANEOUS_DOWNLOADS);
fprintf(stderr, "Downloading %d images...\n", iTotalDownloads);
time(&tmTimer);
multiDownload(&mdhDownloads);
dblElapsed += difftime(time(NULL), tmTimer);
curl_multi_cleanup(curlMultiHandle);
freeMultiDownloadHelper(mdhDownloads);
free(szURLs); // The URL strings themselves were already released by freeMultiDownloadHelper()
for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++)
curl_easy_cleanup(curlSharedHandles[iK]);
fprintf(stderr, "Load time: %0.2f\n", dblElapsed);
iResult = EXIT_SUCCESS;
}
else
fprintf(stderr, "Could not find a single image resource the source web page\n");
}
else
fprintf(stderr, "Could not download the source web page\n");
curl_global_cleanup();
return iResult;
}