7割Copilot君に書いてもらったリサンプリングプログラム(仮)2
経緯を残しているだけなので、(完)の記事が最新です。
CUDA対応で爆速
インストールしとくもの
別途CUDA ToolkitとcuDNNのインストールが必要。
この2つのインストールは面倒なので他の人の記事を見てください。
あとは前回と同じ。
sudo apt update
sudo apt install libsndfile1 libsndfile1-dev
resample.cuで保存
#include <cuda_runtime.h>
#include <iostream>
__global__ void resampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter ) {
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t >= outputFrames) return;
double resultPos = static_cast<double>(t) / static_cast<double>(oversamplingFactor);
int i = t % oversamplingFactor;
for (int c = 0; c < sfinfo_channels; ++c) {
if (i == 0) {
outputData[t * sfinfo_channels + c] = inputData[static_cast<int>(resultPos) * sfinfo_channels + c];
} else {
double sum = 0.0;
for (int k = -sincLen; k <= sincLen; ++k) {
int targetPos = static_cast<int>(resultPos) + k;
if (targetPos >= 0 && targetPos < sfinfo_frames) {
sum += inputData[targetPos * sfinfo_channels + c] * sincTable[i * (2 * sincLen + 1) + k + sincLen];
}
}
outputData[t * sfinfo_channels + c] = sum;
}
}
atomicAdd(progressCounter, 1);
}
extern "C" void callResampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter, cudaStream_t kernelStream) {
int numBlocks = (outputFrames + 255) / 256;
resampleKernel<<<numBlocks, 256, 0, kernelStream>>>(inputData, outputData, sfinfo_channels, oversamplingFactor, outputFrames, sincLen, sincTable, sfinfo_frames, progressCounter );
cudaStreamSynchronize(kernelStream);
}
resample.cppで保存
#include <iostream>
#include <vector>
#include <cmath>
#include <cstring>
#include <sndfile.h>
#include <omp.h>
#include <atomic>
#include <mutex>
#include <thread>
#include <iomanip>
#include <chrono>
#include <cuda_runtime.h>
extern "C" void callResampleKernel(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, double* sincTable, int sfinfo_frames, int* progressCounter, cudaStream_t kernelStream);
// カイザー窓関数
double kaiserWindow(int n, int N, double beta) {
double x = 2.0 * n / (N - 1) - 1;
return std::cyl_bessel_i(0, beta * std::sqrt(1 - std::pow(x, 2))) / std::cyl_bessel_i(0, beta);
}
std::vector<double> precomputedKaiserWindow(int sincLen, double beta){
std::vector<double> kaiserWindowTable(2 * sincLen + 1);
double collectWindowGain = kaiserWindow(sincLen, 2 * sincLen + 1, beta);
#pragma omp parallel for
for (int k = -sincLen; k <= sincLen; ++k) {
kaiserWindowTable[k + sincLen] = kaiserWindow(k + sincLen, 2 * sincLen + 1, beta) / collectWindowGain;
}
kaiserWindowTable[sincLen] = 1.0;
return kaiserWindowTable;
}
// sinc関数
double sinc(double x) {
if (x == 0.0) {
return 1.0;
}
return sin(M_PI * x) / (M_PI * x);
}
std::vector<std::vector<double>> precomputedSinc(int sincLen, int oversamplingFactor , std::vector<double> kaiserWindowTable) {
std::vector<std::vector<double>> sincTable(oversamplingFactor, std::vector<double>(2 * sincLen +1, 0.0));
#pragma omp parallel for
for (int i = 1; i < oversamplingFactor; ++i) {
double resultPos = static_cast<double>(i) / static_cast<double>(oversamplingFactor);
for (int k = -sincLen; k <= sincLen; ++k) {
int targetPos = static_cast<int>(resultPos) + k;
double sampleGap = resultPos - targetPos;
sincTable[i][k + sincLen] = sinc(sampleGap) * kaiserWindowTable[k + sincLen];
}
}
return sincTable;
}
void resampleWithCUDA(double* inputData, double* outputData, int sfinfo_channels, int oversamplingFactor, int outputFrames, int sincLen, const std::vector<std::vector<double>>& sincTable, int sfinfo_frames) {
double* d_inputData;
double* d_outputData;
double* d_sincTable;
int* d_progressCounter;
cudaMalloc((void**)&d_inputData, sfinfo_channels * sfinfo_frames * sizeof(double));
cudaMalloc((void**)&d_outputData, sfinfo_channels * outputFrames * sizeof(double));
std::vector<double> flattenedSincTable;
for (const auto& row : sincTable) {
flattenedSincTable.insert(flattenedSincTable.end(), row.begin(), row.end());
}
cudaMalloc((void**)&d_sincTable, flattenedSincTable.size() * sizeof(double));
cudaMalloc((void**)&d_progressCounter, sizeof(int));
cudaMemcpy(d_inputData, inputData, sfinfo_channels * sfinfo_frames * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_sincTable, flattenedSincTable.data(), flattenedSincTable.size() * sizeof(double), cudaMemcpyHostToDevice);
int initialCounter = 0;
cudaMemcpy(d_progressCounter, &initialCounter, sizeof(int), cudaMemcpyHostToDevice);
cudaStream_t kernelStream;
cudaStream_t progressStream;
cudaStreamCreate(&kernelStream);
cudaStreamCreate(&progressStream);
#pragma omp parallel
{
#pragma omp single nowait
{
int progressCounter = 0;
double startTime = omp_get_wtime();
while (progressCounter < outputFrames) {
cudaMemcpyAsync(&progressCounter, d_progressCounter, sizeof(int), cudaMemcpyDeviceToHost, progressStream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA Error during progress check: " << cudaGetErrorString(err) << std::endl;
break;
}
cudaStreamSynchronize(progressStream);
double elapsedTime = omp_get_wtime() - startTime;
if (progressCounter >= outputFrames) break;
if (progressCounter > 0) {
double progress = 100.0 * progressCounter / outputFrames;
double estimatedTotalTime = elapsedTime * (static_cast<double>(outputFrames) / static_cast<double>(progressCounter));
double remainingTime = estimatedTotalTime - elapsedTime;
std::string timeUnit;
if (remainingTime >= 86400) {
remainingTime /= 86400;
timeUnit = "日";
} else if (remainingTime >= 3600) {
remainingTime /= 3600;
timeUnit = "時間";
} else if (remainingTime >= 60) {
remainingTime /= 60;
timeUnit = "分";
} else {
timeUnit = "秒";
}
std::cout << "\rリサンプリング進行中: " << std::fixed << std::setprecision(2) << progress << " %"
<< " 残り時間: " << std::setprecision(2) << remainingTime << " " << timeUnit << std::flush;
}
std::this_thread::sleep_for(std::chrono::seconds(1));
}
std::cout << "\rリサンプリング進行中: 100.00 % 残り時間: 0.00 秒" << std::endl << std::flush;
}
#pragma omp single
{
callResampleKernel(d_inputData, d_outputData, sfinfo_channels, oversamplingFactor, outputFrames, sincLen, d_sincTable, sfinfo_frames, d_progressCounter , kernelStream);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA Error after kernel call: " << cudaGetErrorString(err) << std::endl;
}
cudaStreamSynchronize(kernelStream);
}
}
cudaMemcpy(outputData, d_outputData, sfinfo_channels * outputFrames * sizeof(double), cudaMemcpyDeviceToHost);
cudaStreamDestroy(kernelStream);
cudaStreamDestroy(progressStream);
cudaFree(d_inputData);
cudaFree(d_outputData);
cudaFree(d_sincTable);
cudaFree(d_progressCounter);
}
void resampleAudio(const char *inputFile, const char *outputFile, int oversamplingFactor, int sincLen, int numThreads) {
auto startTime = std::chrono::high_resolution_clock::now();
std::cout << "データ読み込み中..." << std::endl;
// 音声ファイルを読み込み
SF_INFO sfinfo;
SNDFILE *infile = sf_open(inputFile, SFM_READ, &sfinfo);
if (!infile) {
std::cerr << "入力ファイルを開けませんでした: " << inputFile << std::endl;
return;
}
// データを読み込む
std::vector<double> inputData(sfinfo.frames * sfinfo.channels);
sf_read_double(infile, inputData.data(), inputData.size());
sf_close(infile);
std::cout << "リサンプリング中..." << std::endl;
// 出力データのサイズを計算
int outputFrames = sfinfo.frames * oversamplingFactor;
std::vector<double> outputData(outputFrames * sfinfo.channels);
std::vector<double> kaiserWindowTable = precomputedKaiserWindow(sincLen,5);
std::vector<std::vector<double>> sincTable = precomputedSinc(sincLen,oversamplingFactor, kaiserWindowTable);
// 並列処理でリサンプリング
resampleWithCUDA(inputData.data(), outputData.data(), sfinfo.channels, oversamplngFactor, outputFrames, sincLen, sincTable, sfinfo.frames);
// 正規化
std::cout << "正規化中..." << std::endl;
double maxAmplitude = 0.0;
for (double sample : outputData) {
if (std::abs(sample) > maxAmplitude) {
maxAmplitude = std::abs(sample);
}
}
if (maxAmplitude > 0.0) {
for (double &sample : outputData) {
sample /= maxAmplitude;
}
}
// 出力ファイルに書き込む
std::cout << "データ書き込み中..." << std::endl;
SF_INFO outsfinfo;
outsfinfo.samplerate = sfinfo.samplerate * oversamplingFactor;
outsfinfo.channels = sfinfo.channels;
// outsfinfo.format = sfinfo.format;
outsfinfo.format = SF_FORMAT_WAV | SF_FORMAT_PCM_32;
SNDFILE *outfile = sf_open(outputFile, SFM_WRITE, &outsfinfo);
if (!outfile) {
std::cerr << "出力ファイルを開けませんでした: " << outputFile << std::endl;
return;
}
sf_write_double(outfile, outputData.data(), outputData.size());
sf_close(outfile);
std::cout << "リサンプリング完了!" << std::endl;
}
int main(int argc, char *argv[]) {
if (argc < 3) {
std::cerr << "使用方法: " << argv[0] << " input.wav output.wav -o <オーバーサンプリング係数> -l <sinc_len> -t <並列処理数>" << std::endl;
return 1;
}
const char *inputFile = argv[1];
const char *outputFile = argv[2];
int oversamplingFactor = 2;
int sincLen = 128;
int numThreads = 1;
for (int i = 3; i < argc; i++) {
if (std::strcmp(argv[i], "-o") == 0) {
oversamplingFactor = std::atoi(argv[++i]);
} else if (std::strcmp(argv[i], "-l") == 0) {
sincLen = std::atoi(argv[++i]);
} else if (std::strcmp(argv[i], "-t") == 0) {
numThreads = std::atoi(argv[++i]);
}
}
omp_set_num_threads(numThreads);
resampleAudio(inputFile, outputFile, oversamplingFactor, sincLen, numThreads);
return 0;
}
コンパイル方法
resample.cppを置いてあるディレクトリでターミナルを起動
もしくはcdで飛ぶ
nvcc -o resample resample.cpp resample.cu -Xcompiler="-O3 -march=native -funroll-loops -fopenmp" -lcudart -lsndfile
実行
変換したいファイルがinput.wavで、出力したいファイルがoutput.wavのとき
./resample input.wav output.wav -l 1024 -o 2 -t 16
-l sinc_lenを設定(初期値128)
-o オーバーサンプリング倍率(初期値2)
-t マルチスレッド数(初期値1)
メモ
Sinc関数は少し勉強したけど正しいコードかは不明
速い
今回のコードは変換用だけど、384KHz / sinc_len 65536 くらいならきっとリアルタイムに再生できる(RTX3060)
実装予定
窓関数の自動計算(テストコードに実装済み)SincLenの自動計算(テストコードに実装済み)クリップしている波形の再現
非整数倍のリサンプリングの実装(テストコードに実装済み)いろんなフォーマットへの対応
リアルタイム変換