
Browser-fast parallel downloads with libcURL

Sample code in plain C showing how to download a ton of small files in a fraction of the time.

I don't expect this to help anyone in the future, but in case you ask: honestly, the same code has also been tested with bigger files (and by bigger I mean several gigabytes each), and it works with no problems.
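
Before the full listing, here is the bare skeleton of libcurl's multi interface that the whole sample is built on: one easy handle per transfer, every handle added to a single multi handle, and one loop driving them all concurrently. This is just an orientation sketch, not part of the sample below (the URLs are made up, there is no error checking, no write callback so the bytes simply go to stdout, and the easy-handle cleanup is left out for brevity):

// Minimal multi-interface skeleton - only the idea behind the sample below
#include <curl/curl.h>

int main(void) {
    const char *szURLs[] = { "http://example.com/a.jpg", "http://example.com/b.jpg" };
    CURLM      *curlMulti;
    int        iK, iActiveHandles;
    curl_global_init(CURL_GLOBAL_ALL);
    curlMulti = curl_multi_init();
    for (iK = 0; iK < 2; iK++) {                    // One easy handle per transfer...
        CURL *curlEasy = curl_easy_init();
        curl_easy_setopt(curlEasy, CURLOPT_URL, szURLs[iK]);
        curl_multi_add_handle(curlMulti, curlEasy); // ...all handed over to the multi handle
    }
    do {                                            // Drives every transfer concurrently
        curl_multi_perform(curlMulti, &iActiveHandles);
        if (iActiveHandles)
            curl_multi_wait(curlMulti, NULL, 0, 1000, NULL);
    } while (iActiveHandles);
    curl_multi_cleanup(curlMulti);
    curl_global_cleanup();
    return 0;
}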

// Parallel downloads sample - based on https://curl.haxx.se/libcurl/c/10-at-a-time.html

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <time.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h> // MAX_PATH
#include <curl/curl.h>

// Available download slots. The smaller the downloaded resources, the higher this value can be.
// It can't be too big, though, or fopen_s() will start failing (too many open files) and some slots will never get filled.
#define MAX_SIMULTANEOUS_DOWNLOADS 200

typedef struct {
   int  iIndex;
   char *szURL;
   char *szPath;
} ProgressHelper;

typedef struct {
   unsigned __int64 ui64Size;
   char             *cData;
} DownloadHelper;

typedef struct {
   int            iTotalDownloads;
   bool           *bDownloaded, *bDownloading;
   char           **szURLs, **szPaths;
   FILE           **fDownloads;
   CURL           **curlDownloads;
   ProgressHelper *phProgress;
   DownloadHelper *dhDownload;
} MultiDownloadHelper;

CURLM *curlMultiHandle;

CURL *curlSharedHandles[MAX_SIMULTANEOUS_DOWNLOADS];

bool bBusyHandles[MAX_SIMULTANEOUS_DOWNLOADS];

const char *szSourceURL = "http://alvein.freevar.com";
const char *szDownloadFolder = "C:\\Users\\Alvein\\Avatars";

static size_t write_callback(char *data, size_t size, size_t nitems, void *userdata) {
// write_callback(): receives incoming download data and "saves" it in a DownloadHelper structure.
   unsigned __int64  ui64DataSize = size * nitems;
   DownloadHelper    *dhCurrentDownload = (DownloadHelper *)userdata;
   char              *cDownloadedData = (char *)realloc(dhCurrentDownload->cData,
                                                        dhCurrentDownload->ui64Size + ui64DataSize);
    if (NULL != cDownloadedData) {
        // Saves the downloaded chunk (data) at the end of the downloaded data (cDownloadedData)
        if (0 == memcpy_s(cDownloadedData + dhCurrentDownload->ui64Size,
                          ui64DataSize, // Space left at the end of the reallocated buffer
                          data,
                          ui64DataSize)) {
           dhCurrentDownload->cData = cDownloadedData;
           dhCurrentDownload->ui64Size += ui64DataSize;
           return ui64DataSize;
       }
   }
   return 0;
}

static int progress_callback(void *userdata, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow) {
// progress_callback(): just a simple callback for future use.
   ProgressHelper *phCurrentDownload = (ProgressHelper *)userdata;
    if (dltotal)
        fprintf(stderr, "%s: %lld of %lld\n", phCurrentDownload->szURL, dlnow, dltotal);
   return CURL_PROGRESSFUNC_CONTINUE;
}

bool singleDownload(const char *szURL, char **cContentData, unsigned __int64 *ui64ContentLength) {
// singleDownload():  downloads the resource in szURL.
// cContentData:      returned array of bytes (not a string). Must be released by caller.
// ui64ContentLength: the content length written in cContentData.
   bool           bResult = false;
   CURL           *curlHandle;
   DownloadHelper dhSingle = { 0,NULL };
   *cContentData = NULL;
   *ui64ContentLength = 0;
   curlHandle = curl_easy_init();
   if (NULL != curlHandle) {
       curl_easy_setopt(curlHandle, CURLOPT_URL, szURL);
       curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, write_callback);
       curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, &dhSingle);
       if (CURLE_OK == curl_easy_perform(curlHandle))
           if (dhSingle.ui64Size) {
               *cContentData = dhSingle.cData;
               *ui64ContentLength = dhSingle.ui64Size;
               bResult = true;
           }
   }
   curl_easy_cleanup(curlHandle);
   return bResult;
}

bool multiDownload_StartOne(MultiDownloadHelper *mdhHelper, int iIndex) {
// multiDownload_StartOne(): adds a given download job to the multi interface
   bool bResult = false;
   int  iK;
   FILE *fHandle;
   CURL *curlHandle;
   if (0 == fopen_s(&fHandle, mdhHelper->szPaths[iIndex], "wb")) {
       // Finds a free download slot
       for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++)
           if (!bBusyHandles[iK])
               break;
       if (iK < MAX_SIMULTANEOUS_DOWNLOADS) {
           curlHandle = curlSharedHandles[iK];
           bBusyHandles[iK] = true; // Seizes the download slot
           mdhHelper->fDownloads[iIndex] = fHandle;
           mdhHelper->curlDownloads[iIndex] = curlHandle; // Assigns the shared handle to this job
            mdhHelper->phProgress[iIndex] = (ProgressHelper){ iIndex, mdhHelper->szURLs[iIndex], mdhHelper->szPaths[iIndex] };
            mdhHelper->dhDownload[iIndex] = (DownloadHelper){ 0, NULL }; // Resets the download progress
           curl_easy_setopt(curlHandle, CURLOPT_URL, mdhHelper->szURLs[iIndex]);
           curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, write_callback);
           curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, &mdhHelper->dhDownload[iIndex]);
           #ifdef _DEBUG // Progress is disabled in Release - too much stuff on the console
               curl_easy_setopt(curlHandle, CURLOPT_NOPROGRESS, 0L);
               curl_easy_setopt(curlHandle, CURLOPT_XFERINFOFUNCTION, progress_callback);
               curl_easy_setopt(curlHandle, CURLOPT_XFERINFODATA, &mdhHelper->phProgress[iIndex]);
               fprintf(stderr, "multiDownload_StartOne(%d)...\n", iIndex);
           #endif
           curl_multi_add_handle(curlMultiHandle, curlHandle);
           bResult = true;
       }
   }
   return bResult;
}

void multiDownload(MultiDownloadHelper *mdhHelper) {
// multiDownload(): performs all the download jobs contained in mdhHelper.
   int     iK, iJ, iActiveDownloads, iTotalDownloaded, iActiveHandles, iPendingMessages;
   CURLMsg *curlMessage;
   // Finds every not-completed/not-busy download job...
   iActiveDownloads = iTotalDownloaded = 0;
   for (; iActiveDownloads < MAX_SIMULTANEOUS_DOWNLOADS; iActiveDownloads++) {
       for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
           if (!mdhHelper->bDownloaded[iK])
               if (!mdhHelper->bDownloading[iK])
                   break;
       if (iK < mdhHelper->iTotalDownloads)
           mdhHelper->bDownloading[iK] = multiDownload_StartOne(mdhHelper, iK); // ...and starts them...
       else
           break;
   } // ...as long as there are no more than MAX_SIMULTANEOUS_DOWNLOADS active jobs
   do {
       curl_multi_perform(curlMultiHandle, &iActiveHandles);
       do {
           curlMessage = curl_multi_info_read(curlMultiHandle, &iPendingMessages);
           if (NULL != curlMessage) {
               // Finds the index of the download job the received message belongs to
               for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
                   if (curlMessage->easy_handle == mdhHelper->curlDownloads[iK])
                       break;
               if (iK < mdhHelper->iTotalDownloads) {
                   if (CURLMSG_DONE == curlMessage->msg) {
                       if (CURLE_OK == curlMessage->data.result) {
                           long lResCode;
                           curl_easy_getinfo(mdhHelper->curlDownloads[iK], CURLINFO_RESPONSE_CODE, &lResCode);
                           // The response code is ignored in this sample (let's assume it's always HTTP 200 OK)
                           mdhHelper->bDownloaded[iK] = true;
                           mdhHelper->bDownloading[iK] = false;
                           iTotalDownloaded++;
                           fwrite(mdhHelper->dhDownload[iK].cData,
                                  sizeof(char),
                                  mdhHelper->dhDownload[iK].ui64Size,
                                  mdhHelper->fDownloads[iK]); // Saves the downloaded file in a single shot
                           #ifdef _DEBUG
                               fprintf(stderr, "\nDownload is complete (%ld): %s\n", lResCode, mdhHelper->szPaths[iK]);
                           #endif
                       }
                       else {
                           fprintf(stderr, "\n**Download failed (%d): %s\n", curlMessage->data.result, mdhHelper->szPaths[iK]);
                           mdhHelper->bDownloading[iK] = false;
                       }
                       fclose(mdhHelper->fDownloads[iK]);
                       mdhHelper->fDownloads[iK] = NULL;
                       curl_multi_remove_handle(curlMultiHandle, mdhHelper->curlDownloads[iK]);
                       // Instead of calling curl_easy_cleanup(mdhHelper->curlDownloads[iK])...
                       for (iJ = 0; iJ < MAX_SIMULTANEOUS_DOWNLOADS; iJ++)
                           if (curlSharedHandles[iJ] == mdhHelper->curlDownloads[iK])
                               break;
                       bBusyHandles[iJ] = false;            // ...frees the associated download slot...
                       mdhHelper->curlDownloads[iK] = NULL; // ...where mdhHelper->curlDownloads[iK] is in
                       iActiveDownloads--;
                       if (iTotalDownloaded < mdhHelper->iTotalDownloads) {
                           // Finds all the pending download jobs, and starts them...
                           for (; iActiveDownloads < MAX_SIMULTANEOUS_DOWNLOADS; iActiveDownloads++) {
                               for (iK = 0; iK < mdhHelper->iTotalDownloads; iK++)
                                   if (!mdhHelper->bDownloaded[iK])
                                       if (!mdhHelper->bDownloading[iK])
                                           break;
                               if (iK < mdhHelper->iTotalDownloads)
                                   mdhHelper->bDownloading[iK] = multiDownload_StartOne(mdhHelper, iK);
                               else
                                   break;
                           } // ...as long as there are no more than MAX_SIMULTANEOUS_DOWNLOADS active jobs
                       }
                   }
                   else // Improbable to happen
                       fprintf(stderr, "\n!!Unknown message (%d): %s\n", curlMessage->msg, mdhHelper->szPaths[iK]);;
               }
               else // Impossible to happen
                   fprintf(stderr, "\n!!Could not find the messaging handle in the downloads list\n");
           }
       } while (NULL != curlMessage);
       if (iActiveHandles) // Gives one second to the active and non responsive downloads...
           curl_multi_wait(curlMultiHandle, NULL, 0, 1000, NULL); // ...before continuing the messages poll
       else
           if (iTotalDownloaded == mdhHelper->iTotalDownloads)
               break; // Exits if every download job has finished
   } while (true);
}

void allocMultiDownloadHelper(MultiDownloadHelper *mdhHelper, int iHowMany) {
// allocMultiDownloadHelper(): allocates the required memory for every download job.
//                             calloc() keeps every entry NULL/zero, so jobs that never
//                             start are still safe to pass to freeMultiDownloadHelper().
    mdhHelper->iTotalDownloads = iHowMany;
    mdhHelper->bDownloaded = (bool *)calloc(iHowMany, sizeof(bool));
    mdhHelper->bDownloading = (bool *)calloc(iHowMany, sizeof(bool));
    mdhHelper->szURLs = (char **)calloc(iHowMany, sizeof(char *));
    mdhHelper->szPaths = (char **)calloc(iHowMany, sizeof(char *));
    mdhHelper->fDownloads = (FILE **)calloc(iHowMany, sizeof(FILE *));
    mdhHelper->curlDownloads = (CURL **)calloc(iHowMany, sizeof(CURL *));
    mdhHelper->phProgress = (ProgressHelper *)calloc(iHowMany, sizeof(ProgressHelper));
    mdhHelper->dhDownload = (DownloadHelper *)calloc(iHowMany, sizeof(DownloadHelper));
}

void freeMultiDownloadHelper(MultiDownloadHelper mdhHelper) {
// freeMultiDownloadHelper(): releases the memory allocated for every download job.
   for (int iK = 0; iK < mdhHelper.iTotalDownloads; iK++) {
       free(mdhHelper.szURLs[iK]);
       free(mdhHelper.szPaths[iK]);
       free(mdhHelper.dhDownload[iK].cData);
   }
   free(mdhHelper.bDownloaded);
   free(mdhHelper.bDownloading);
   free(mdhHelper.szURLs);
   free(mdhHelper.szPaths);
   free(mdhHelper.fDownloads);
   free(mdhHelper.curlDownloads);
   free(mdhHelper.phProgress);
   free(mdhHelper.dhDownload);
}

void parseHTMLImgTags(char *szHTML, char ***szImgSources, int *iTotal) {
// parseHTMLImgTags(): shameless <img> tags parsing in the HTML content supplied in szHTML.
//                     Not to be taken seriously.
// szImgSources:       returned array of URLs as NULL-terminated strings.
// iTotal:             the number of image URLs found.
   unsigned __int64 ui64ImgSrcLen;
   char             *szHTMLNdx, *szImgSrc, **szRllSources,
                    *szImgTagStart, *szImgTagEnd, *szSrcAttStart, *szSrcAttEnd;
   *iTotal = 0;
   *szImgSources = NULL;
   szHTMLNdx = szHTML;
   do {
       szImgTagStart = strstr(szHTMLNdx, "<img ");
       if (NULL != szImgTagStart) {
           szImgTagEnd = strstr(szImgTagStart + 5, ">");
           if (NULL != szImgTagEnd) {
               szSrcAttStart = strstr(szImgTagStart, "src=\"");
               if (NULL != szSrcAttStart) {
                   szSrcAttEnd = strstr(szSrcAttStart + 5, "\"");
                   if (NULL != szSrcAttEnd) {
                       ui64ImgSrcLen = szSrcAttEnd - szSrcAttStart - 5;
                        szImgSrc = (char *)malloc(ui64ImgSrcLen + 1);
                        if (NULL != szImgSrc &&
                            0 == strncpy_s(szImgSrc, ui64ImgSrcLen + 1, szSrcAttStart + 5, ui64ImgSrcLen)) {
                           szImgSrc[ui64ImgSrcLen] = '\0';
                           szRllSources = (char **)realloc(*szImgSources, (*iTotal + 1) * sizeof(char *));
                           if (NULL != szRllSources) {
                               *szImgSources = szRllSources;
                               (*szImgSources)[(*iTotal)++] = _strdup(szImgSrc);
                           }
                       }
                       free(szImgSrc);
                   }
               }
           }
            if (NULL != szImgTagEnd)
                szHTMLNdx = szImgTagEnd + 1;
            else
                szImgTagStart = NULL; // Unterminated <img> tag: stops the scan
       }
   } while (NULL != szImgTagStart);
}

int main(void) {
   int                 iResult = EXIT_FAILURE, iK, iTotalDownloads;
   unsigned __int64    ui64HTMLSize;
   char                *cHTML, *szImgExt, **szURLs, szLocalFile[MAX_PATH];
   double              dblElapsed;
   time_t              tmTimer;
   FILE                *fHTML;
   MultiDownloadHelper mdhDownloads;
   curl_global_init(CURL_GLOBAL_ALL);
   time(&tmTimer);
   // Downloads the source web page
   if (singleDownload(szSourceURL, &cHTML, &ui64HTMLSize)) {
       dblElapsed = difftime(time(NULL), tmTimer);
       iTotalDownloads = 0;
       szURLs = NULL;
       sprintf_s(szLocalFile, MAX_PATH, "%s\\source.html", szDownloadFolder);
        (void)fopen_s(&fHTML, szLocalFile, "wb"); // fopen_s() sets fHTML to NULL on failure
        if (ui64HTMLSize) {
            // Saves the content in the download folder
            if (NULL != fHTML)
                fwrite(cHTML, sizeof(char), ui64HTMLSize, fHTML);
            cHTML = (char *)realloc(cHTML, ui64HTMLSize + 1);
            if (NULL != cHTML) {
                cHTML[ui64HTMLSize] = '\0'; // Assumes the content is HTML - handles it as ASCIIz
                parseHTMLImgTags(cHTML, &szURLs, &iTotalDownloads);
            }
        }
        if (NULL != fHTML)
            fclose(fHTML);
       free(cHTML);
       if (iTotalDownloads) {
           // Initializes every handle in the download slots - sets them as "available"
           for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++) {
               curlSharedHandles[iK] = curl_easy_init();
               bBusyHandles[iK] = false;
           }
           allocMultiDownloadHelper(&mdhDownloads, iTotalDownloads);
           // Initializes the download jobs (1 per image resource)
           for (iK = 0; iK < iTotalDownloads; iK++) {
               #ifdef _DEBUG
                   fprintf(stderr, "Image resource: %s\n", szURLs[iK]);
               #endif
               mdhDownloads.bDownloaded[iK] = mdhDownloads.bDownloading[iK] = false;
               mdhDownloads.szURLs[iK] = szURLs[iK];
               // Makes the local filename for each job - just a numeric sequence, for simplicity
               mdhDownloads.szPaths[iK] = (char *)malloc(MAX_PATH * sizeof(char));
               sprintf_s(mdhDownloads.szPaths[iK], MAX_PATH, "%s\\%05u", szDownloadFolder, iK);
               // Adds a file extension, based on the image resource URL - rudimentary method
               szImgExt = strrchr(szURLs[iK], '.');
               if (NULL != szImgExt)
                   if (szImgExt == strstr(szImgExt, ".jpg"))
                       strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".jpg");
                   else if (szImgExt == strstr(szImgExt, ".png"))
                       strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".png");
                   else if (szImgExt == strstr(szImgExt, ".gif"))
                       strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".gif");
                   else
                       strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".tmp");
               else
                   strcat_s(mdhDownloads.szPaths[iK], MAX_PATH, ".tmp");
           }
           curlMultiHandle = curl_multi_init();
            curl_multi_setopt(curlMultiHandle, CURLMOPT_MAXCONNECTS, (long)MAX_SIMULTANEOUS_DOWNLOADS);
           fprintf(stderr, "Downloading %d images...\n", iTotalDownloads);
           time(&tmTimer);
           multiDownload(&mdhDownloads);
           dblElapsed += difftime(time(NULL), tmTimer);
           curl_multi_cleanup(curlMultiHandle);
           freeMultiDownloadHelper(mdhDownloads);
           for (iK = 0; iK < MAX_SIMULTANEOUS_DOWNLOADS; iK++)
               curl_easy_cleanup(curlSharedHandles[iK]);
           fprintf(stderr, "Load time: %0.2f\n", dblElapsed);
           iResult = EXIT_SUCCESS;
       }
       else
           fprintf(stderr, "Could not find a single image resource the source web page\n");
   }
   else
       fprintf(stderr, "Could not download the source web page\n");
   curl_global_cleanup();
   return iResult;
}
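
One last note about MAX_SIMULTANEOUS_DOWNLOADS and the fopen_s() warning at the top: by default the Microsoft CRT only lets 512 FILE streams be open at the same time, so a much larger slot count will make fopen_s() start failing with "too many open files". If you ever want to push the value that high, the limit can be raised (up to a hard maximum of 8192) with _setmaxstdio(). A small sketch, as a hypothetical helper you would call once early in main():

#include <stdio.h>

// raiseOpenFileLimit(): hypothetical helper - raises the CRT limit on simultaneously
// open FILE streams (default 512, hard maximum 8192).
void raiseOpenFileLimit(void) {
    if (-1 == _setmaxstdio(2048))
        fprintf(stderr, "Could not raise the open-file limit\n");
}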
