見出し画像

7割Copilot君に書いてもらったリサンプリングプログラム(仮)2

経緯を残しているだけなので、(完)の記事が最新です。

CUDA対応で爆速


インストールしとくもの

別途CUDA ToolkitとcuDNNのインストールが必要。
この2つのインストールは面倒なので他の人の記事を見てください。
あとは前回と同じ。

sudo apt update
sudo apt install libsndfile1 libsndfile1-dev

resample.cuで保存

#include <cuda_runtime.h>
#include <iostream>

__global__ void resampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter ) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= outputFrames) return;

    double resultPos = static_cast<double>(t) / static_cast<double>(oversamplingFactor);
    int i = t % oversamplingFactor;

    for (int c = 0; c < sfinfo_channels; ++c) {
        if (i == 0) {
            outputData[t * sfinfo_channels + c] = inputData[static_cast<int>(resultPos) * sfinfo_channels + c];
        } else {
            double sum = 0.0;
            for (int k = -sincLen; k <= sincLen; ++k) {
                int targetPos = static_cast<int>(resultPos) + k;
                if (targetPos >= 0 && targetPos < sfinfo_frames) {
                    sum += inputData[targetPos * sfinfo_channels + c] * sincTable[i * (2 * sincLen + 1) + k + sincLen];
                }
            }
            outputData[t * sfinfo_channels + c] = sum;
        }
    }

    atomicAdd(progressCounter, 1);
}

extern "C" void callResampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter, cudaStream_t kernelStream) {
    int numBlocks = (outputFrames + 255) / 256;
    resampleKernel<<<numBlocks, 256, 0, kernelStream>>>(inputData, outputData, sfinfo_channels, oversamplingFactor, outputFrames, sincLen, sincTable, sfinfo_frames, progressCounter );
    cudaStreamSynchronize(kernelStream);
}

resample.cppで保存

#include <iostream>
#include <vector>
#include <cmath>
#include <cstring>
#include <sndfile.h>
#include <omp.h>
#include <atomic>
#include <mutex>
#include <thread>
#include <iomanip>
#include <chrono>
#include <cuda_runtime.h>

extern "C" void callResampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter, cudaStream_t kernelStream);

// カイザー窓関数
double kaiserWindow(int n, int N, double beta) {
    double x = 2.0 * n / (N - 1) - 1;
    return std::cyl_bessel_i(0, beta * std::sqrt(1 - std::pow(x, 2))) / std::cyl_bessel_i(0, beta);
}

std::vector<double> precomputedKaiserWindow(int sincLen, double beta){

    std::vector<double> kaiserWindowTable(2 * sincLen + 1);
    
    double collectWindowGain = kaiserWindow(sincLen, 2 * sincLen + 1, beta);

    #pragma omp parallel for
    for (int k = -sincLen; k <= sincLen; ++k) {
        kaiserWindowTable[k + sincLen] = kaiserWindow(k + sincLen, 2 * sincLen + 1, beta) / collectWindowGain;
    }

    kaiserWindowTable[sincLen] = 1.0;

    return kaiserWindowTable;
}

// sinc関数
double sinc(double x) {
    if (x == 0.0) {
        return 1.0;
    }
    return sin(M_PI * x) / (M_PI * x);
}

std::vector<std::vector<double>> precomputedSinc(int sincLen, int oversamplingFactor , std::vector<double> kaiserWindowTable) {

    std::vector<std::vector<double>> sincTable(oversamplingFactor, std::vector<double>(2 * sincLen +1, 0.0));

    #pragma omp parallel for
    for (int i = 1; i < oversamplingFactor; ++i) {
        double resultPos = static_cast<double>(i) / static_cast<double>(oversamplingFactor);
        for (int k = -sincLen; k <= sincLen; ++k) {
            int targetPos = static_cast<int>(resultPos) + k;
            double sampleGap = resultPos - targetPos;
            sincTable[i][k + sincLen] = sinc(sampleGap) * kaiserWindowTable[k + sincLen];
        }
    }

    return sincTable;
}

void resampleWithCUDA(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, const std::vector<std::vector<double>>& sincTable, int sfinfo_frames) {
    double* d_inputData;
    double* d_outputData;
    double* d_sincTable;
    int* d_progressCounter;

    cudaMalloc((void**)&d_inputData, sfinfo_channels * sfinfo_frames * sizeof(double));
    cudaMalloc((void**)&d_outputData, sfinfo_channels * outputFrames * sizeof(double));

    std::vector<double> flattenedSincTable;
    for (const auto& row : sincTable) {
        flattenedSincTable.insert(flattenedSincTable.end(), row.begin(), row.end());
    }
    cudaMalloc((void**)&d_sincTable, flattenedSincTable.size() * sizeof(double));
    cudaMalloc((void**)&d_progressCounter, sizeof(int));

    cudaMemcpy(d_inputData, inputData, sfinfo_channels * sfinfo_frames * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sincTable, flattenedSincTable.data(), flattenedSincTable.size() * sizeof(double), cudaMemcpyHostToDevice);

    int initialCounter = 0;
    cudaMemcpy(d_progressCounter, &initialCounter, sizeof(int), cudaMemcpyHostToDevice);


    cudaStream_t kernelStream;
    cudaStream_t progressStream;
    cudaStreamCreate(&kernelStream);
    cudaStreamCreate(&progressStream);

    #pragma omp parallel
    {
        #pragma omp single nowait
        {
            int progressCounter = 0;
            double startTime = omp_get_wtime();
            while (progressCounter < outputFrames) {

                cudaMemcpyAsync(&progressCounter, d_progressCounter, sizeof(int), cudaMemcpyDeviceToHost, progressStream);
                
                cudaError_t err = cudaGetLastError();
                if (err != cudaSuccess) {
                    std::cerr << "CUDA Error during progress check: " << cudaGetErrorString(err) << std::endl;
                    break;
                }
                cudaStreamSynchronize(progressStream);

                double elapsedTime = omp_get_wtime() - startTime;
                if (progressCounter >= outputFrames) break;
                if (progressCounter > 0) {

                    double progress = 100.0 * progressCounter / outputFrames;

                    double estimatedTotalTime = elapsedTime * (static_cast<double>(outputFrames) / static_cast<double>(progressCounter));
                    double remainingTime = estimatedTotalTime - elapsedTime;
                    std::string timeUnit;
                    if (remainingTime >= 86400) {
                        remainingTime /= 86400;
                        timeUnit = "日";
                    } else if (remainingTime >= 3600) {
                        remainingTime /= 3600;
                        timeUnit = "時間";
                    } else if (remainingTime >= 60) {
                        remainingTime /= 60;
                        timeUnit = "分";
                    } else {
                        timeUnit = "秒";
                    }

                    std::cout << "\rリサンプリング進行中: " << std::fixed << std::setprecision(2) << progress << " %"
                              << " 残り時間: " << std::setprecision(2) << remainingTime << " " << timeUnit << std::flush;


                }
                std::this_thread::sleep_for(std::chrono::seconds(1));
            }

            std::cout << "\rリサンプリング進行中: 100.00 % 残り時間: 0.00 秒" << std::endl << std::flush;
        }
        #pragma omp single
        {
            callResampleKernel(d_inputData, d_outputData, sfinfo_channels, oversamplingFactor, outputFrames, sincLen, d_sincTable, sfinfo_frames, d_progressCounter , kernelStream);
           
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                std::cerr << "CUDA Error after kernel call: " << cudaGetErrorString(err) << std::endl;
            }

            cudaStreamSynchronize(kernelStream);
        }

    }

    cudaMemcpy(outputData, d_outputData, sfinfo_channels * outputFrames * sizeof(double), cudaMemcpyDeviceToHost);

    cudaStreamDestroy(kernelStream);
    cudaStreamDestroy(progressStream);
    cudaFree(d_inputData);
    cudaFree(d_outputData);
    cudaFree(d_sincTable);
    cudaFree(d_progressCounter);
}

void resampleAudio(const char *inputFile, const char *outputFile, int oversamplingFactor, int sincLen, int numThreads) {
    auto startTime = std::chrono::high_resolution_clock::now();

    std::cout << "データ読み込み中..." << std::endl;

    // 音声ファイルを読み込み
    SF_INFO sfinfo;
    SNDFILE *infile = sf_open(inputFile, SFM_READ, &sfinfo);
    if (!infile) {
        std::cerr << "入力ファイルを開けませんでした: " << inputFile << std::endl;
        return;
    }

    // データを読み込む
    std::vector<double> inputData(sfinfo.frames * sfinfo.channels);
    sf_read_double(infile, inputData.data(), inputData.size());
    sf_close(infile);

    std::cout << "リサンプリング中..." << std::endl;

    // 出力データのサイズを計算
    int outputFrames = sfinfo.frames * oversamplingFactor;
    std::vector<double> outputData(outputFrames * sfinfo.channels);

    std::vector<double> kaiserWindowTable = precomputedKaiserWindow(sincLen,5);
    std::vector<std::vector<double>> sincTable = precomputedSinc(sincLen,oversamplingFactor, kaiserWindowTable);

    // 並列処理でリサンプリング

    resampleWithCUDA(inputData.data(), outputData.data(), sfinfo.channels, oversamplngFactor, outputFrames, sincLen, sincTable, sfinfo.frames);

    // 正規化
    std::cout << "正規化中..." << std::endl;
    double maxAmplitude = 0.0;
    for (double sample : outputData) {
        if (std::abs(sample) > maxAmplitude) {
            maxAmplitude = std::abs(sample);
        }
    }

    if (maxAmplitude > 0.0) {
        for (double &sample : outputData) {
            sample /= maxAmplitude;
        }
    }

    // 出力ファイルに書き込む
    std::cout << "データ書き込み中..." << std::endl;
    SF_INFO outsfinfo;
    outsfinfo.samplerate = sfinfo.samplerate * oversamplingFactor;
    outsfinfo.channels = sfinfo.channels;
    // outsfinfo.format = sfinfo.format;
    outsfinfo.format = SF_FORMAT_WAV | SF_FORMAT_PCM_32;

    SNDFILE *outfile = sf_open(outputFile, SFM_WRITE, &outsfinfo);
    if (!outfile) {
        std::cerr << "出力ファイルを開けませんでした: " << outputFile << std::endl;
        return;
    }

    sf_write_double(outfile, outputData.data(), outputData.size());
    sf_close(outfile);

    std::cout << "リサンプリング完了!" << std::endl;
}

int main(int argc, char *argv[]) {
    if (argc < 3) {
        std::cerr << "使用方法: " << argv[0] << " input.wav output.wav -o <オーバーサンプリング係数> -l <sinc_len> -t <並列処理数>" << std::endl;
        return 1;
    }

    const char *inputFile = argv[1];
    const char *outputFile = argv[2];
    int oversamplingFactor = 2;
    int sincLen = 128;
    int numThreads = 1;

    for (int i = 3; i < argc; i++) {
        if (std::strcmp(argv[i], "-o") == 0) {
            oversamplingFactor = std::atoi(argv[++i]);
        } else if (std::strcmp(argv[i], "-l") == 0) {
            sincLen = std::atoi(argv[++i]);
        } else if (std::strcmp(argv[i], "-t") == 0) {
            numThreads = std::atoi(argv[++i]);
        }
    }

    omp_set_num_threads(numThreads);
    resampleAudio(inputFile, outputFile, oversamplingFactor, sincLen, numThreads);

    return 0;
}

コンパイル方法

resample.cppを置いてあるディレクトリでターミナルを起動
もしくはcdで飛ぶ

nvcc -o resample resample.cpp resample.cu -Xcompiler="-O3 -march=native -funroll-loops -fopenmp" -lcudart -lsndfile

実行

変換したいファイルがinput.wavで、出力したいファイルがoutput.wavのとき

./resample input.wav output.wav -l 1024 -o 2 -t 16

-l sinc_lenを設定(初期値128)
-o オーバーサンプリング倍率(初期値2)
-t マルチスレッド数(初期値1)

メモ

Sinc関数は少し勉強したけど正しいコードかは不明

速い
今回のコードは変換用だけど、384KHz / sinc_len 65536 くらいならきっとリアルタイムに再生できる(RTX3060)

実装予定

  • 窓関数の自動計算(テストコードに実装済み)

  • SincLenの自動計算(テストコードに実装済み)

  • クリップしている波形の再現

  • 非整数倍のリサンプリングの実装(テストコードに実装済み)

  • いろんなフォーマットへの対応

  • リアルタイム変換

いいなと思ったら応援しよう!