Android는 GPU를 사용하여 JPEG 이미지 디코딩 가속화(Opencl)

26334 단어 GPU 가속
전재는 출처를 표시하십시오:https://blog.csdn.net/u013752202/article/details/92794209
1. opencl kernel 만들기
(1)kerenl을 만들고 shading 파일을 컴파일합니다
(2)kernel 작업 그룹 정보 얻기
2. GPU 및 CPU 공동 작업
(1) 메모리를 생성하고 CPU 및 GPU에 동기화합니다(예:
        tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));
        opencl_sync_mem(tablebuffer, TOCPU);
        opencl_sync_mem(tablebuffer, TOGPU);
(2) 다음과 같은 GPU kernel 매개 변수를 설정합니다.
        int error;
        size_t alignblocks;
        cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);
        opencl_set_mem(kernel, inputBuffer, 0);
        opencl_set_mem(kernel, yuvBuffer, 1);
        opencl_set_mem(kernel, tablebuffer, 2);
        opencl_set_mem(kernel, offsetbuffer, 3);
        error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);
        assert(CL_SUCCESS == error);
        error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);
        assert(CL_SUCCESS == error);
(3) GPU 렌더링 파이프라인을 추가하고 다음 계산이 완료될 때까지 기다립니다.
        error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);
        assert(CL_SUCCESS == error);
        aclFinish(context->queue);
JpegNativeLib.cpp (JNI) :

//
// Created by Administrator on 2017/1/18.
//
extern "C"
{
#include "jpeglib.h"
#include 
};
#include 
#include 
#include 
#include 
#include 

#include 
#define   LOG_TAG    "MYLOG"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)

#include 
using namespace UGLES;

void preDecode(FILE *infile, jpeg_decompress_struct *cinfo)
{
    struct jpeg_error_mgr jerr;
    cinfo->err = jpeg_std_error(&jerr);
    jpeg_stdio_src(cinfo, infile);
    (void) jpeg_read_header(cinfo, TRUE);
    /*    float  ,  cpu  ,       ,   gpu      ,  float      ,          */
    cinfo->dct_method = JDCT_FLOAT;
    (void) jpeg_start_decompress(cinfo);
    auto width = cinfo->output_width;
    auto height = cinfo->output_height;
    LOGI("W=%d,H=%d",width,height);
}

int jpegDecode(GLuint glTextureId,int flag)
{
    LOGI("Java_zhw_jpegclapp_MainActivity_jpegDecode");
{
#if 1
        /*opencl     api*/
        const char *inputfile = "/sdcard/3040x1520_4.jpeg";
        struct jpeg_decompress_struct cinfo;

        jpeg_create_decompress(&cinfo);//     

        FILE *infile = fopen(inputfile, "rb");
        if (NULL == infile) {
            LOGE("Open input file err.");
            return -1;
        }
        preDecode(infile, &cinfo);
        LOGI("GPU DECODE START");
        int ret = jpeg_decode_by_opencl(&cinfo,glTextureId,flag);
        fclose(infile);
        if (-1 == ret) {
            LOGE("jpeg_decode_by_opencl failed");
            return -1;
        }
        LOGI("GPU DECODE END");
#endif
    //    pixels     

    //   abort   finish,     
    (void) jpeg_abort_decompress(&cinfo);
    jpeg_destroy_decompress(&cinfo);
    return 0;
}

}

UsrGLES usrGLES;
GLuint textureId=0;

extern "C"
void Java_zhw_jpegclapp_GL2JNILib_init(JNIEnv * env, jobject obj,  jint width, jint height)
{
    usrGLES.setupGraphics(width, height);
    textureId=usrGLES.getTextureID();
    if (-1 == openclInit()) {
        LOGE("openclInit failed");
        return;
    }
    if(-1==jpegDecode(textureId,0)) {
        LOGE("jpegDecode error !");
        return;
    }
}

extern "C"
void Java_zhw_jpegclapp_GL2JNILib_step(JNIEnv * env, jobject obj)
{
    TIME_START;
    if(-1==jpegDecode(textureId,1)) {
        LOGE("jpegDecode error !");
        return;
    }
    TIME_END;
    usrGLES.renderFrame();
}

extern "C"
void Java_zhw_jpegclapp_GL2JNILib_stop(JNIEnv * env, jobject obj)
{
    openclDestroy();
}

jpeg_opencl.h

#ifndef JPEG_OPENCL_H
#define JPEG_OPENCL_H
#include "opencl_package.h"
#include 
#include "jpeglib.h"
#include "jpegint.h"
#define DEUBUG_ON
#ifdef DEUBUG_ON
#include 

#include 
#include 

#define   LOG_TAG    "MYLOG"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)

#define TIME_START struct timeval tv_start, tv_end;gettimeofday(&tv_start, NULL);
#define TIME_END gettimeofday(&tv_end, NULL); LOGI("Time = %fms in %s, %d
", (tv_end.tv_sec*1000000 + tv_end.tv_usec - tv_start.tv_sec*1000000 - tv_start.tv_usec)/1000.0f, __FUNCTION__, __LINE__);gettimeofday(&tv_start, NULL); #else #define TIME_START #define TIME_END #endif opencl_context* jpeg_get_context(); typedef enum { IDCT_FLOAT=0, YUV_RGB, KERNELNUMBER }KERNELNAME; cl_kernel jpeg_get_kernel(KERNELNAME name); int jpeg_decode_by_opencl(j_decompress_ptr cinfo, GLuint glTextureId,int flag); int openclInit(); void openclDestroy(); #endif

jpeg_opencl.c

#include "jpeg_opencl.h"
#include 
#include 
#include 
#include 

#include 
#include 
#include 

#include "aopencl.h"

#define   LOG_TAG    "CL"
#define   LOGI(...)  __android_log_print(ANDROID_LOG_INFO,LOG_TAG,__VA_ARGS__)
#define   LOGE(...)  __android_log_print(ANDROID_LOG_ERROR,LOG_TAG,__VA_ARGS__)

static opencl_context* gInstance = NULL;
static pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
static cl_kernel gKernel[KERNELNUMBER];
static size_t gMaxGroupSize[KERNELNUMBER];
#include "KernelWarp.h"
static void initKernel(opencl_context* context)
{
    int error;
    gKernel[IDCT_FLOAT] = opencl_compile_create_kernel(context, idct_kernel_clclh, "idct_float");
    gKernel[YUV_RGB] = opencl_compile_create_kernel(context,yuv_rgb_clclh, "yuv_rgb");
    LOGI("gInstance->device_id=0x%x",gInstance->device_id);
    error = aclGetKernelWorkGroupInfo(gKernel[IDCT_FLOAT], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+IDCT_FLOAT, NULL);
    LOGI("(gMaxGroupSize+IDCT_FLOAT)=%d",*(gMaxGroupSize+IDCT_FLOAT));
    assert(CL_SUCCESS == error);
    error = aclGetKernelWorkGroupInfo(gKernel[YUV_RGB], gInstance->device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), gMaxGroupSize+YUV_RGB, NULL);
    assert(CL_SUCCESS == error);
}

opencl_context* jpeg_get_context()
{
    if (NULL == gInstance)
    {
        pthread_mutex_lock(&gMutex);
        if (NULL == gInstance)
        {
            gInstance = opencl_create_context(1);
            initKernel(gInstance);
        }
        pthread_mutex_unlock(&gMutex);
    }
    return gInstance;
}
cl_kernel jpeg_get_kernel(KERNELNAME name)
{
    assert(NULL!=gInstance);
    return gKernel[name];
}

struct ComponentInfo
{
    int max_x_sample;
    int max_y_sample;
    int YW;
    int YH;
    int UW;
    int UH;
    int UOffset;
    int VW;
    int VH;
    int VOffset;
    int blocksInMCU;
    int MCU_Per_Row;
};

opencl_mem* rgbBuffer;

opencl_context* context;
opencl_mem* inputBuffer;
JBLOCK* mcu;
JBLOCK** MCU_buffer;
opencl_mem* yuvBuffer;
opencl_mem* tablebuffer;
opencl_mem* offsetbuffer;
int currentoffset = 0;
size_t totalBlock;
size_t totalMCUNumber;

int openclInit()
{
    initFns();
    TIME_START;
    /*
    if (cinfo->dct_method != JDCT_FLOAT || cinfo->comps_in_scan != 3 || DCTSIZE != 8)
    {
        return -1;
    }*/
    context = jpeg_get_context();
    TIME_END;
    return 0;
}

void openclDestroy()
{
    opencl_destroy_mem(yuvBuffer);
    opencl_destroy_mem(rgbBuffer);
    opencl_destroy_mem(inputBuffer);
    opencl_destroy_context(context);
}

int gloalValInit(j_decompress_ptr cinfo,GLuint glTextureId)
{
    TIME_START;
    totalMCUNumber = cinfo->MCU_rows_in_scan * cinfo->MCUs_per_row;
    totalBlock = totalMCUNumber*cinfo->blocks_in_MCU;
    for (int i=0;i<3; ++i)
    {
        assert(cinfo->cur_comp_info[i]->DCT_scaled_size == 8);
    }
    inputBuffer = opencl_create_mem(context, totalBlock * sizeof(JBLOCK));
    TIME_END;
    cl_int err;
    LOGI("glTextureId =%d",glTextureId);
    rgbBuffer = (opencl_mem *) malloc(sizeof(opencl_mem));
    rgbBuffer->size = 4 * sizeof(unsigned char) * DCTSIZE2 * cinfo->MCU_rows_in_scan *
                      cinfo->MCUs_per_row * cinfo->max_h_samp_factor * cinfo->max_v_samp_factor;
    rgbBuffer->queue = context->queue;
    rgbBuffer->map = NULL;
    rgbBuffer->base = aclCreateFromGLTexture(context->context, CL_MEM_READ_WRITE, GL_TEXTURE_2D,
                                            0, glTextureId, &err);
    if (0 != err) {//CL_INVALID_CONTEXT
        LOGE("clCreateFromGLTexture2D error,%p,err=%d", rgbBuffer->base, err);
        return -1;
    }
    else {
        LOGI("clCreateFromGLTexture success,rgbBuffer->base=%p", rgbBuffer->base);
    }
    yuvBuffer = opencl_create_mem(context, totalMCUNumber*cinfo->blocks_in_MCU * DCTSIZE2 * sizeof(unsigned char));
    TIME_END;
    return 0;
}

int jpeg_decode_by_opencl(j_decompress_ptr cinfo,GLuint glTextureId,int flag)
{
    if(0==flag){
        if(-1==gloalValInit(cinfo,glTextureId)){
            return  -1;
        }
    }
    TIME_START;
    opencl_sync_mem(inputBuffer, TOCPU);
    TIME_END;
    jzero_far(inputBuffer->map, totalMCUNumber*cinfo->blocks_in_MCU * sizeof(JBLOCK));
    MCU_buffer = (JBLOCK**)malloc(sizeof(JBLOCK*)*cinfo->blocks_in_MCU);
    mcu = (JBLOCK*)inputBuffer->map;
    TIME_END;
    LOGI("totalMCUNumber=%d",totalMCUNumber);
    int i, blkn;
    for (i = 0; i < totalMCUNumber; ++i)
    {
        for (blkn = 0; blkn < cinfo->blocks_in_MCU; ++blkn)
        {
            MCU_buffer[blkn] = mcu + cinfo->blocks_in_MCU*i + blkn;
        }
        if ( FALSE == (*cinfo->entropy->decode_mcu) (cinfo, MCU_buffer))
        {
            break;
        }
    }
    free(MCU_buffer);
    opencl_sync_mem(inputBuffer, TOGPU);
    TIME_END;
    /*Upload quantry table*/
    offsetbuffer = opencl_create_mem(context, cinfo->blocks_in_MCU*sizeof(int));
    tablebuffer = opencl_create_mem(context, cinfo->comps_in_scan*DCTSIZE2*sizeof(float));
    opencl_sync_mem(tablebuffer, TOCPU);
    opencl_sync_mem(offsetbuffer, TOCPU);
    for (i = 0; i< cinfo->comps_in_scan; ++i)
    {
        int j;
        jpeg_component_info* info = cinfo->cur_comp_info[i];
        float* table = (float*)info->dct_table;//FIXME
        float* table_in_buffer = (float*)(tablebuffer->map) + DCTSIZE2*i;
        int* offset = (int*)offsetbuffer->map;
        memcpy(table_in_buffer, table, sizeof(float)*DCTSIZE2);
        for (j=0; jh_samp_factor*info->v_samp_factor; ++j)
        {
            offset[currentoffset + j] = i;
        }
        currentoffset+=info->h_samp_factor*info->v_samp_factor;
    }
    TIME_END;
    opencl_sync_mem(tablebuffer, TOGPU);
    opencl_sync_mem(offsetbuffer, TOGPU);
    /*idct*/
    {
        int error;
        size_t alignblocks;
        cl_kernel kernel = jpeg_get_kernel(IDCT_FLOAT);
        opencl_set_mem(kernel, inputBuffer, 0);
        opencl_set_mem(kernel, yuvBuffer, 1);
        opencl_set_mem(kernel, tablebuffer, 2);
        opencl_set_mem(kernel, offsetbuffer, 3);
        error = aclSetKernelArg(kernel, 4, sizeof(int), &cinfo->blocks_in_MCU);
        assert(CL_SUCCESS == error);
        error = aclSetKernelArg(kernel, 5, sizeof(size_t), &totalBlock);
        assert(CL_SUCCESS == error);
        alignblocks = (totalBlock + gMaxGroupSize[IDCT_FLOAT]-1)/gMaxGroupSize[IDCT_FLOAT]*gMaxGroupSize[IDCT_FLOAT];
        error = aclEnqueueNDRangeKernel(context->queue, kernel, 1, NULL, &alignblocks, gMaxGroupSize+IDCT_FLOAT, 0, NULL, NULL);
        assert(CL_SUCCESS == error);
        aclFinish(context->queue);
    }
    opencl_destroy_mem(tablebuffer);
    opencl_destroy_mem(offsetbuffer);
    TIME_END;
    /*TODO Sample YUV to RGB*/
    {
        int stride = cinfo->MCUs_per_row * cinfo->max_h_samp_factor * DCTSIZE;
        struct ComponentInfo info;
        size_t global[2] = {cinfo->MCUs_per_row*cinfo->max_h_samp_factor, cinfo->MCU_rows_in_scan*cinfo->max_v_samp_factor*DCTSIZE};
        cl_kernel kernel = jpeg_get_kernel(YUV_RGB);
        opencl_sync_mem(rgbBuffer, TOGPU);
        assert(NULL!=rgbBuffer);
        info.YW = cinfo->cur_comp_info[0]->h_samp_factor;
        info.YH = cinfo->cur_comp_info[0]->v_samp_factor;
        info.UW = cinfo->cur_comp_info[1]->h_samp_factor;
        info.UH = cinfo->cur_comp_info[1]->v_samp_factor;
        info.VW = cinfo->cur_comp_info[2]->h_samp_factor;
        info.VH = cinfo->cur_comp_info[2]->v_samp_factor;
        info.UOffset = info.YW*info.YH;
        info.VOffset = info.UOffset + info.UW*info.UH;
        info.blocksInMCU = cinfo->blocks_in_MCU;
        info.MCU_Per_Row = cinfo->MCUs_per_row;
        info.max_x_sample = cinfo->max_h_samp_factor;
        info.max_y_sample = cinfo->max_v_samp_factor;
//        LOGI("%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d",info.YW,info.YH,info.UW,info.UH,info.VW,info.VH,info.UOffset,info.VOffset,info.blocksInMCU,info.MCU_Per_Row,info.max_x_sample,info.max_y_sample);
//        LOGI("stride=%d",stride);
        {
            int error ;
            opencl_set_mem(kernel, yuvBuffer, 0);
            opencl_set_mem(kernel, rgbBuffer, 1);
            aclSetKernelArg(kernel, 3, sizeof(int), &info.max_x_sample);
            aclSetKernelArg(kernel, 4, sizeof(int), &info.max_y_sample);
            aclSetKernelArg(kernel, 5, sizeof(int), &info.YW);
            aclSetKernelArg(kernel, 6, sizeof(int), &info.YH);
            aclSetKernelArg(kernel, 7, sizeof(int), &info.UW);
            aclSetKernelArg(kernel, 8, sizeof(int), &info.UH);
            aclSetKernelArg(kernel, 9, sizeof(int), &info.UOffset);
            aclSetKernelArg(kernel, 10, sizeof(int), &info.VW);
            aclSetKernelArg(kernel, 11, sizeof(int), &info.VH);
            aclSetKernelArg(kernel, 12, sizeof(int), &info.VOffset);
            aclSetKernelArg(kernel, 13, sizeof(int), &info.blocksInMCU);
            aclSetKernelArg(kernel, 14, sizeof(int), &info.MCU_Per_Row);
            assert(CL_SUCCESS == error);
            error = aclSetKernelArg(kernel, 2, sizeof(int), &stride);
            assert(CL_SUCCESS == error);
            error = aclEnqueueNDRangeKernel(context->queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
            assert(CL_SUCCESS == error);
            aclFinish(context->queue);
        }
        /*Copy rgbbuffer to output_buf*/
        TIME_END;
    }
    //opencl_destroy_mem(yuvBuffer);
    //opencl_destroy_mem(rgbBuffer);
    LOGI("jpeg_decode_by_opencl END");
    return 1;
}

KernelWrap.h(shading 프로그램)

const char* idct_kernel_clclh = 
"#define DCTSIZE 8
" "#define DCTSIZE2 64
" "#define LOADSRC(i, src) convert_float8(vload8(i, src))*vload8(i, table)
" "__kernel void idct_float(__global short* input, __global unsigned char* output, __global const float* dequantilize_table, __global const int* order, int blocks_per_mcu, uint totalblocks)
" "{
" " int blkn = get_global_id(0);
" " if (blkn < totalblocks)
" " {
" " __global short* src = input + DCTSIZE2*blkn;
" " __global unsigned char* outptr = output + DCTSIZE2*blkn;
" " __global const float* table = dequantilize_table + order[blkn % blocks_per_mcu]*DCTSIZE2;
" " float8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
" " float8 w0, w1, w2, w3, w4, w5, w6, w7;
" " float8 tmp10, tmp11, tmp12, tmp13;
" " float8 z5, z10, z11, z12, z13;
" " tmp0 = LOADSRC(0, src);
" " tmp1 = LOADSRC(2, src);
" " tmp2 = LOADSRC(4, src);
" " tmp3 = LOADSRC(6, src);
" " tmp10 = tmp0 + tmp2; /* phase 3 */
" " tmp11 = tmp0 - tmp2;
" "
" " tmp13 = tmp1 + tmp3; /* phases 5-3 */
" " tmp12 = (tmp1 - tmp3) * (float8)1.414213562 - tmp13; /* 2*c4 */
" "
" " tmp0 = tmp10 + tmp13; /* phase 2 */
" " tmp3 = tmp10 - tmp13;
" " tmp1 = tmp11 + tmp12;
" " tmp2 = tmp11 - tmp12;
" "
" " tmp4 = LOADSRC(1, src);
" " tmp5 = LOADSRC(3, src);
" " tmp6 = LOADSRC(5, src);
" " tmp7 = LOADSRC(7, src);
" "
" " z13 = tmp6 + tmp5; /* phase 6 */
" " z10 = tmp6 - tmp5;
" " z11 = tmp4 + tmp7;
" " z12 = tmp4 - tmp7;
" "
" " tmp7 = z11 + z13; /* phase 5 */
" " tmp11 = (z11 - z13) * (float8)(1.414213562); /* 2*c4 */
" "
" " z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */
" " tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */
" " tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */
" "
" " tmp6 = tmp12 - tmp7; /* phase 2 */
" " tmp5 = tmp11 - tmp6;
" " tmp4 = tmp10 + tmp5;
" "
" " tmp0 = tmp0 + tmp7;
" " tmp7 = tmp0 - (float8)(2)*tmp7;
" " tmp1 = tmp1 + tmp6;
" " tmp6 = tmp1 - (float8)(2)*tmp6;
" " tmp2 = tmp2 + tmp5;
" " tmp5 = tmp2 - (float8)(2)*tmp5;
" " tmp4 = tmp3 + tmp4;
" " tmp3 = (float8)(2)*tmp3 - tmp4;
" " /*Cross*/
" "#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)
" " TRANS(w, 0);
" " TRANS(w, 1);
" " TRANS(w, 2);
" " TRANS(w, 3);
" " TRANS(w, 4);
" " TRANS(w, 5);
" " TRANS(w, 6);
" " TRANS(w, 7);
" "#undef TRANS
" "
" " tmp10 = w0 + w4;
" " tmp11 = w0 - w4;
" "
" " tmp13 = w2 + w6;
" " tmp12 = (w2 - w6) * (float8)(1.414213562) - tmp13;
" "
" " tmp0 = tmp10 + tmp13;
" " tmp3 = tmp10 - tmp13;
" " tmp1 = tmp11 + tmp12;
" " tmp2 = tmp11 - tmp12;
" "
" " z13 = w5 + w3;
" " z10 = w5 - w3;
" " z11 = w1 + w7;
" " z12 = w1 - w7;
" "
" " tmp7 = z11 + z13;
" " tmp11 = (z11 - z13) * (float8)(1.414213562);
" "
" " z5 = (z10 + z12) * (float8)(1.847759065); /* 2*c2 */
" " tmp10 = (float8)(1.082392200) * z12 - z5; /* 2*(c2-c6) */
" " tmp12 = (float8)(-2.613125930) * z10 + z5; /* -2*(c2+c6) */
" "
" " tmp6 = tmp12 - tmp7;
" " tmp5 = tmp11 - tmp6;
" " tmp4 = tmp10 + tmp5;
" "
" " tmp0 = tmp0 + tmp7;
" " tmp7 = tmp0 - (float8)(2)*tmp7;
" " tmp1 = tmp1 + tmp6;
" " tmp6 = tmp1 - (float8)(2)*tmp6;
" " tmp2 = tmp2 + tmp5;
" " tmp5 = tmp2 - (float8)(2)*tmp5;
" " tmp4 = tmp3 + tmp4;
" " tmp3 = (float8)(2)*tmp3 - tmp4;
" " /*Cross*/
" "#define TRANS(w, i) w##i = (float8)(tmp0.s##i, tmp1.s##i, tmp2.s##i, tmp3.s##i, tmp4.s##i, tmp5.s##i, tmp6.s##i, tmp7.s##i)
" " TRANS(w, 0);
" " TRANS(w, 1);
" " TRANS(w, 2);
" " TRANS(w, 3);
" " TRANS(w, 4);
" " TRANS(w, 5);
" " TRANS(w, 6);
" " TRANS(w, 7);
" "#undef TRANS
" " /* Final output stage: scale down by a factor of 8 and range-limit */
" "#define RESULT(t) convert_uchar8(clamp((t)/(float8)(8)+(float8)(128), (float8)(0), (float8)(255)))
" " vstore8(RESULT(w0), 0, outptr);
" " vstore8(RESULT(w7), 7, outptr);
" " vstore8(RESULT(w1), 1, outptr);
" " vstore8(RESULT(w6), 6, outptr);
" " vstore8(RESULT(w2), 2, outptr);
" " vstore8(RESULT(w5), 5, outptr);
" " vstore8(RESULT(w4), 4, outptr);
" " vstore8(RESULT(w3), 3, outptr);
" "#undef RESULT
" " }
" "}
" ; const char* yuv_rgb_clclh = "#define DCTSIZE2 64
" "#define DCTSIZE 8
" "struct ComponentInfo
" "{
" " int max_x_sample;
" " int max_y_sample;
" " int YW;
" " int YH;
" " int UW;
" " int UH;
" " int UOffset;
" " int VW;
" " int VH;
" " int VOffset;
" " int blocksInMCU;
" " int MCU_Per_Row;
" "};
" "__kernel void yuv_rgb(__global unsigned char* yuvbuffer,__write_only image2d_t rgba, int output_stride,int max_x_sample,int max_y_sample," "int YW,int YH,int UW,int UH,int UOffset,int VW,int VH,int VOffset,int blocksInMCU,int MCU_Per_Row)
" "{
" " struct ComponentInfo info;
" "info.max_x_sample=max_x_sample;
" "info.max_y_sample=max_y_sample;
" "info.YW=YW;
" "info.YH=YH;
" "info.UW=UW;
" "info.UH=UH;
" "info.UOffset=UOffset;
" "info.VW=VW;
" "info.VH=VH;
" "info.VOffset=VOffset;
" "info.blocksInMCU=blocksInMCU;
" "info.MCU_Per_Row=MCU_Per_Row;
" " int x = get_global_id(0);
" " int y_origin = get_global_id(1);
" " int yoffset = y_origin % DCTSIZE;
" " int y = y_origin/DCTSIZE;
" " int mcux = x/info.max_x_sample;
" " int mcuy = y/info.max_y_sample;
" " __global unsigned char* basic = yuvbuffer + DCTSIZE2*info.blocksInMCU*(info.MCU_Per_Row*mcuy + mcux);
" " __global unsigned char* Y = basic + DCTSIZE2*((x%info.YW) + 2*(y%info.YH)) + DCTSIZE*yoffset;
" " __global unsigned char* U = basic + DCTSIZE2*((x%info.UW) + 2*(y%info.UH)+info.UOffset) + DCTSIZE*yoffset;
" " __global unsigned char* V = basic + DCTSIZE2*((x%info.VW) + 2*(y%info.VH)+info.VOffset) + DCTSIZE*yoffset;
" " float8 yy = convert_float8(vload8(0, Y));
" " float8 uu = convert_float8(vload8(0, U)) - (float8)(128);
" " float8 vv = convert_float8(vload8(0, V)) - (float8)(128);
" " float8 r, g, b;
" "#define RESULT(x) (clamp(x, (float8)(0), (float8)(255)))
" " r = RESULT((yy + (float8)(1.40200)*vv)/(float8)255.0);
" " g = RESULT((yy - (float8)(0.34414)*uu - (float8)(0.71414)*vv)/(float8)255.0);
" " b = RESULT((yy + (float8)(1.77200)*uu)/(float8)255.0);
" "#undef RESULT
" " int2 coord= (int2)(x*DCTSIZE, y_origin);
" " float4 color=(float4)(r.s0, g.s0, b.s0, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+1, y_origin);
" " color=(float4)(r.s1, g.s1, b.s1, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+2, y_origin);
" " color=(float4)(r.s2, g.s2, b.s2, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+3, y_origin);
" " color=(float4)(r.s3, g.s3, b.s3, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+4, y_origin);
" " color=(float4)(r.s4, g.s4, b.s4, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+5, y_origin);
" " color=(float4)(r.s5, g.s5, b.s5, 0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+6, y_origin);
" " color=(float4)(r.s6, g.s6, b.s6,0);
" " write_imagef(rgba,coord,color);
" " coord= (int2)(x*DCTSIZE+7, y_origin);
" " color=(float4)(r.s7, g.s7, b.s7, 0);
" " write_imagef(rgba,coord,color);
" "}
" ;

전재는 출처를 표시하십시오:https://blog.csdn.net/u013752202/article/details/92794209

좋은 웹페이지 즐겨찾기