Halide Study Notes: Reading the Halide tutorial source code (3)

Halide Beginner Tutorial 03


// Halide tutorial lesson 3: Inspecting the generated code

// This lesson demonstrates how to inspect what the Halide compiler is producing.

// On linux, you can compile and run it like so:
// g++ lesson_03*.cpp -g -I ../include -L ../bin -lHalide -lpthread -ldl -o lesson_03 -std=c++11
// LD_LIBRARY_PATH=../bin ./lesson_03

// On os x:
// g++ lesson_03*.cpp -g -I ../include -L ../bin -lHalide -o lesson_03 -std=c++11
// DYLD_LIBRARY_PATH=../bin ./lesson_03

// If you have the entire Halide source tree, you can also build it by
// running:
//    make tutorial_lesson_03_debugging_1
// in a shell with the current directory at the top of the halide
// source tree.

#include "Halide.h"
#include <stdio.h>

// This time we'll just import the entire Halide namespace
using namespace Halide;

int main(int argc, char **argv) {

    // We'll start by defining the simple single-stage imaging
    // pipeline from lesson 1.

    // This lesson will be about debugging, but unfortunately in C++,
    // objects don't know their own names, which makes it hard for us
    // to understand the generated code. To get around this, you can
    // pass a string to the Func and Var constructors to give them a
    // name for debugging purposes.
    Func gradient("gradient");
    Var x("x"), y("y");
    gradient(x, y) = x + y;

    // Realize the function to produce an output image. We'll keep it
    // very small for this lesson.
    Buffer<int> output = gradient.realize(8, 8);

    // That line compiled and ran the pipeline. Try running this
    // lesson with the environment variable HL_DEBUG_CODEGEN set to
    // 1. It will print out the various stages of compilation, and a
    // pseudocode representation of the final pipeline.
    // For example: export HL_DEBUG_CODEGEN=1

    // If you set HL_DEBUG_CODEGEN to a higher number, you can see
    // more and more details of how Halide compiles your pipeline.
    // Setting HL_DEBUG_CODEGEN=2 shows the Halide code at each stage
    // of compilation, and also the llvm bitcode we generate at the
    // end.
    // For example: export HL_DEBUG_CODEGEN=2

    // Halide will also output an HTML version of this output, which
    // supports syntax highlighting and code-folding, so it can be
    // nicer to read for large pipelines. Open gradient.html with your
    // browser after running this tutorial.
    gradient.compile_to_lowered_stmt("gradient.html", {}, HTML);

    // You can usually figure out what code Halide is generating using
    // this pseudocode. In the next lesson we'll see how to snoop on
    // Halide at runtime.

    printf("Success!\n");
    return 0;
}
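
Besides the HTML form used in the lesson, compile_to_lowered_stmt can also emit the lowered pseudocode as plain text, which is handy to diff when experimenting with schedules. A minimal sketch, assuming the same Halide version and API as the lesson above (Text is the other StmtOutputFormat value alongside HTML):

#include "Halide.h"
using namespace Halide;

int main() {
    Func gradient("gradient");
    Var x("x"), y("y");
    gradient(x, y) = x + y;

    // Same call as in the lesson, but writing a plain-text .stmt file
    // instead of an HTML page.
    gradient.compile_to_lowered_stmt("gradient.stmt", {}, Text);
    return 0;
}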

Compile and run in the terminal:

$ g++ lesson_03*.cpp -g -I ../include -L ../bin -lHalide -lpthread -ldl -o lesson_03 -std=c++11
$ export HL_DEBUG_CODEGEN=1
$ ./lesson_03

Terminal output:

Inferred argument: (void *) __user_context
Creating initial loop nests...
Injecting realization of gradient
Canonicalizing GPU var names...
Skipping injecting memoization...
Injecting tracing...
Adding checks for parameters
Computing bounds of each function's value
Adding checks for images
Performing computation bounds inference...
Performing sliding window optimization...
Performing allocation bounds inference...
Removing code that depends on undef values...
Uniquifying variable names...
Performing storage folding optimization...
Injecting debug_to_file calls...
Simplifying...
Injecting prefetches...
Dynamically skipping stages...
Destructuring tuple-valued realizations...
Performing storage flattening...
Unpacking buffer arguments...
Skipping rewriting memoized allocations...
Simplifying...
Reduce prefetch dimension...
Unrolling...
Vectorizing...
Detecting vector interleavings...
Partitioning loops to simplify boundary conditions...
Trimming loops to the region over which they do something...
Injecting early frees...
Simplifying...
Lowering after final simplification:
assert((reinterpret(uint64, gradient.buffer) != (uint64)0), halide_error_buffer_argument_is_null("gradient"))
let gradient = _halide_buffer_get_host(gradient.buffer)
let gradient.type.code = _halide_buffer_get_type_code(gradient.buffer)
let gradient.type.bits = _halide_buffer_get_type_bits(gradient.buffer)
let gradient.type.lanes = _halide_buffer_get_type_lanes(gradient.buffer)
let gradient.min.0 = _halide_buffer_get_min(gradient.buffer, 0)
let gradient.extent.0 = _halide_buffer_get_extent(gradient.buffer, 0)
let gradient.stride.0 = _halide_buffer_get_stride(gradient.buffer, 0)
let gradient.min.1 = _halide_buffer_get_min(gradient.buffer, 1)
let gradient.extent.1 = _halide_buffer_get_extent(gradient.buffer, 1)
let gradient.stride.1 = _halide_buffer_get_stride(gradient.buffer, 1)
if (_halide_buffer_is_bounds_query(gradient.buffer)) {
  _halide_buffer_init(gradient.buffer, _halide_buffer_get_shape(gradient.buffer), reinterpret((void *), (uint64)0), (uint64)0, reinterpret((halide_device_interface_t *), (uint64)0), 0, 32, 2, make_struct((halide_dimension_t *), gradient.min.0, gradient.extent.0, 1, 0, gradient.min.1, gradient.extent.1, gradient.extent.0, 0), (uint64)0)
}
if (!_halide_buffer_is_bounds_query(gradient.buffer)) {
  assert((((gradient.type.code == (uint8)0) && (gradient.type.bits == (uint8)32)) && (gradient.type.lanes == (uint16)1)), halide_error_bad_type("Output buffer gradient", gradient.type.code, (uint8)0, gradient.type.bits, (uint8)32, gradient.type.lanes, (uint16)1))
  assert((0 <= gradient.extent.0), halide_error_buffer_extents_negative("Output buffer gradient", 0, gradient.extent.0))
  assert((0 <= gradient.extent.1), halide_error_buffer_extents_negative("Output buffer gradient", 1, gradient.extent.1))
  assert((gradient.stride.0 == 1), halide_error_constraint_violated("gradient.stride.0", gradient.stride.0, "1", 1))
  let gradient.total_extent.1 = (int64(gradient.extent.1)*int64(gradient.extent.0))
  assert((abs(int64(gradient.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs(int64(gradient.extent.0)), (uint64)2147483647))
  assert((abs((int64(gradient.extent.1)*int64(gradient.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs((int64(gradient.extent.1)*int64(gradient.stride.1))), (uint64)2147483647))
  assert((gradient.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("gradient", gradient.total_extent.1, (int64)2147483647))
  assert((gradient != reinterpret((void *), (uint64)0)), halide_error_host_is_null("Output buffer gradient"))
  produce gradient {
    let t17 = (gradient.min.0 + (gradient.min.1*gradient.stride.1))
    for (gradient.s0.y, gradient.min.1, gradient.extent.1) {
      for (gradient.s0.x, gradient.min.0, gradient.extent.0) {
        gradient[((gradient.s0.x + (gradient.s0.y*gradient.stride.1)) - t17)] = (gradient.s0.x + gradient.s0.y)
      }
    }
  }
}


Splitting off Hexagon offload...
Hexagon device code module: Target = hexagon-32-noos-no_runtime

Target triple of initial module: x86_64-pc-windows-gnu-elf
Generating llvm bitcode...
Generating llvm bitcode prolog for function gradient...
Generating llvm bitcode for function gradient...
JIT compiling shared runtime
JIT compiling gradient
JIT input scalar argument __user_context @ 0x852368
JIT output buffer @ 0x84f738, 0x852480
Creating initial loop nests...
Injecting realization of gradient
Canonicalizing GPU var names...
Skipping injecting memoization...
Injecting tracing...
Adding checks for parameters
Computing bounds of each function's value
Adding checks for images
Performing computation bounds inference...
Performing sliding window optimization...
Performing allocation bounds inference...
Removing code that depends on undef values...
Uniquifying variable names...
Performing storage folding optimization...
Injecting debug_to_file calls...
Simplifying...
Injecting prefetches...
Dynamically skipping stages...
Destructuring tuple-valued realizations...
Performing storage flattening...
Unpacking buffer arguments...
Skipping rewriting memoized allocations...
Simplifying...
Reduce prefetch dimension...
Unrolling...
Vectorizing...
Detecting vector interleavings...
Partitioning loops to simplify boundary conditions...
Trimming loops to the region over which they do something...
Injecting early frees...
Simplifying...
Lowering after final simplification:
assert((reinterpret(uint64, gradient.buffer) != (uint64)0), halide_error_buffer_argument_is_null("gradient"))
let gradient = _halide_buffer_get_host(gradient.buffer)
let gradient.type.code = _halide_buffer_get_type_code(gradient.buffer)
let gradient.type.bits = _halide_buffer_get_type_bits(gradient.buffer)
let gradient.type.lanes = _halide_buffer_get_type_lanes(gradient.buffer)
let gradient.min.0 = _halide_buffer_get_min(gradient.buffer, 0)
let gradient.extent.0 = _halide_buffer_get_extent(gradient.buffer, 0)
let gradient.stride.0 = _halide_buffer_get_stride(gradient.buffer, 0)
let gradient.min.1 = _halide_buffer_get_min(gradient.buffer, 1)
let gradient.extent.1 = _halide_buffer_get_extent(gradient.buffer, 1)
let gradient.stride.1 = _halide_buffer_get_stride(gradient.buffer, 1)
if (_halide_buffer_is_bounds_query(gradient.buffer)) {
  _halide_buffer_init(gradient.buffer, _halide_buffer_get_shape(gradient.buffer), reinterpret((void *), (uint64)0), (uint64)0, reinterpret((halide_device_interface_t *), (uint64)0), 0, 32, 2, make_struct((halide_dimension_t *), gradient.min.0, gradient.extent.0, 1, 0, gradient.min.1, gradient.extent.1, gradient.extent.0, 0), (uint64)0)
}
if (!_halide_buffer_is_bounds_query(gradient.buffer)) {
  assert((((gradient.type.code == (uint8)0) && (gradient.type.bits == (uint8)32)) && (gradient.type.lanes == (uint16)1)), halide_error_bad_type("Output buffer gradient", gradient.type.code, (uint8)0, gradient.type.bits, (uint8)32, gradient.type.lanes, (uint16)1))
  assert((0 <= gradient.extent.0), halide_error_buffer_extents_negative("Output buffer gradient", 0, gradient.extent.0))
  assert((0 <= gradient.extent.1), halide_error_buffer_extents_negative("Output buffer gradient", 1, gradient.extent.1))
  assert((gradient.stride.0 == 1), halide_error_constraint_violated("gradient.stride.0", gradient.stride.0, "1", 1))
  let gradient.total_extent.1 = (int64(gradient.extent.1)*int64(gradient.extent.0))
  assert((abs(int64(gradient.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs(int64(gradient.extent.0)), (uint64)2147483647))
  assert((abs((int64(gradient.extent.1)*int64(gradient.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs((int64(gradient.extent.1)*int64(gradient.stride.1))), (uint64)2147483647))
  assert((gradient.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("gradient", gradient.total_extent.1, (int64)2147483647))
  assert((gradient != reinterpret((void *), (uint64)0)), halide_error_host_is_null("Output buffer gradient"))
  produce gradient {
    let t36 = (gradient.min.0 + (gradient.min.1*gradient.stride.1))
    for (gradient.s0.y, gradient.min.1, gradient.extent.1) {
      for (gradient.s0.x, gradient.min.0, gradient.extent.0) {
        gradient[((gradient.s0.x + (gradient.s0.y*gradient.stride.1)) - t36)] = (gradient.s0.x + gradient.s0.y)
      }
    }
  }
}


Splitting off Hexagon offload...
Hexagon device code module: Target = hexagon-32-noos-no_runtime

Module.compile(): stmt_html_name gradient.html
Success!
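
For larger pipelines this output gets long. Halide writes its debug output to stderr, so it can be captured to a file for later inspection, for example (assuming a POSIX-style shell; the log file name is arbitrary):

$ HL_DEBUG_CODEGEN=1 ./lesson_03 2> lesson_03_codegen.log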

Run in the terminal:

$ export HL_DEBUG_CODEGEN=2
$ ./lesson_03

Terminal output:

Realizing Pipeline for arch_unknown-0-os_unknown
jit-compiling for: x86-64-windows-avx-f16c-jit-mingw-sse41
Inferred argument: (void *) __user_context
Creating initial loop nests...
Injecting realization of gradient
for (.__root, 0, 1) {
  produce gradient {
    let gradient.s0.y.loop_max = gradient.s0.y.max
    let gradient.s0.y.loop_min = gradient.s0.y.min
    let gradient.s0.y.loop_extent = ((gradient.s0.y.max + 1) - gradient.s0.y.min)
    let gradient.s0.x.loop_max = gradient.s0.x.max
    let gradient.s0.x.loop_min = gradient.s0.x.min
    let gradient.s0.x.loop_extent = ((gradient.s0.x.max + 1) - gradient.s0.x.min)
    let gradient.s0.__outermost.loop_extent = 1
    let gradient.s0.__outermost.loop_max = 0
    let gradient.s0.__outermost.loop_min = 0
    for (gradient.s0.__outermost, gradient.s0.__outermost.loop_min, gradient.s0.__outermost.loop_extent) {
      for (gradient.s0.y, gradient.s0.y.loop_min, gradient.s0.y.loop_extent) {
        for (gradient.s0.x, gradient.s0.x.loop_min, gradient.s0.x.loop_extent) {
          gradient(gradient.s0.x, gradient.s0.y) = (gradient.s0.x + gradient.s0.y)
        }
      }
    }
  }
}

Lowering after creating initial loop nests:

// ... (the intermediate output is long; omitted here)

Body after wrapping extern calls:
assert((reinterpret(uint64, gradient.buffer) != (uint64)0), halide_error_buffer_argument_is_null("gradient"))
let gradient = _halide_buffer_get_host(gradient.buffer)
let gradient.type.code = _halide_buffer_get_type_code(gradient.buffer)
let gradient.type.bits = _halide_buffer_get_type_bits(gradient.buffer)
let gradient.type.lanes = _halide_buffer_get_type_lanes(gradient.buffer)
let gradient.min.0 = _halide_buffer_get_min(gradient.buffer, 0)
let gradient.extent.0 = _halide_buffer_get_extent(gradient.buffer, 0)
let gradient.stride.0 = _halide_buffer_get_stride(gradient.buffer, 0)
let gradient.min.1 = _halide_buffer_get_min(gradient.buffer, 1)
let gradient.extent.1 = _halide_buffer_get_extent(gradient.buffer, 1)
let gradient.stride.1 = _halide_buffer_get_stride(gradient.buffer, 1)
if (_halide_buffer_is_bounds_query(gradient.buffer)) {
  _halide_buffer_init(gradient.buffer, _halide_buffer_get_shape(gradient.buffer), reinterpret((void *), (uint64)0), (uint64)0, reinterpret((halide_device_interface_t *), (uint64)0), 0, 32, 2, make_struct((halide_dimension_t *), gradient.min.0, gradient.extent.0, 1, 0, gradient.min.1, gradient.extent.1, gradient.extent.0, 0), (uint64)0)
}
if (!_halide_buffer_is_bounds_query(gradient.buffer)) {
  assert((((gradient.type.code == (uint8)0) && (gradient.type.bits == (uint8)32)) && (gradient.type.lanes == (uint16)1)), halide_error_bad_type("Output buffer gradient", gradient.type.code, (uint8)0, gradient.type.bits, (uint8)32, gradient.type.lanes, (uint16)1))
  assert((0 <= gradient.extent.0), halide_error_buffer_extents_negative("Output buffer gradient", 0, gradient.extent.0))
  assert((0 <= gradient.extent.1), halide_error_buffer_extents_negative("Output buffer gradient", 1, gradient.extent.1))
  assert((gradient.stride.0 == 1), halide_error_constraint_violated("gradient.stride.0", gradient.stride.0, "1", 1))
  let gradient.total_extent.1 = (int64(gradient.extent.1)*int64(gradient.extent.0))
  assert((abs(int64(gradient.extent.0)) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs(int64(gradient.extent.0)), (uint64)2147483647))
  assert((abs((int64(gradient.extent.1)*int64(gradient.stride.1))) <= (uint64)2147483647), halide_error_buffer_allocation_too_large("gradient", abs((int64(gradient.extent.1)*int64(gradient.stride.1))), (uint64)2147483647))
  assert((gradient.total_extent.1 <= (int64)2147483647), halide_error_buffer_extents_too_large("gradient", gradient.total_extent.1, (int64)2147483647))
  assert((gradient != reinterpret((void *), (uint64)0)), halide_error_host_is_null("Output buffer gradient"))
  produce gradient {
    let t36 = (gradient.min.0 + (gradient.min.1*gradient.stride.1))
    for (gradient.s0.y, gradient.min.1, gradient.extent.1) {
      for (gradient.s0.x, gradient.min.0, gradient.extent.0) {
        gradient[((gradient.s0.x + (gradient.s0.y*gradient.stride.1)) - t36)] = (gradient.s0.x + gradient.s0.y)
      }
    }
  }
}


Body after wrapping extern calls:
let gradient.upgraded = (let t38 = make_struct((halide_dimension_t *), 0, 0, 0, 0, 0, 0, 0, 0) in _halide_buffer_init(alloca(size_of_halide_buffer_t()), t38, reinterpret((void *), (uint64)0), (uint64)0, reinterpret((halide_device_interface_t *), (uint64)0), 0, 32, 2, t38, (uint64)0))
let t41 = halide_upgrade_buffer_t("gradient", gradient, gradient.upgraded)
assert((t41 == 0), t41)
let t42 = gradient(gradient.upgraded)
assert((t42 == 0), t42)
if (_halide_buffer_is_bounds_query(gradient.upgraded)) {
  let t39 = halide_downgrade_buffer_t("gradient", gradient.upgraded, gradient)
  assert((t39 == 0), t39)
} else {
  let t40 = halide_downgrade_buffer_t_device_fields("gradient", gradient.upgraded, gradient)
  assert((t40 == 0), t40)
}


Module.compile(): stmt_html_name gradient.html
Success!

As we can see, with HL_DEBUG_CODEGEN=2 the Halide IR after each intermediate compilation pass is printed as well (along with the LLVM bitcode generated at the end), which makes inspection and debugging easier.

Contents of gradient.html:

[Screenshot: gradient.html rendered in a browser, with syntax highlighting and code folding]

Key points of this lesson:

1. Setting the environment variable HL_DEBUG_CODEGEN to 1 or 2 makes the JIT compiler print its intermediate compilation results, which is convenient for debugging (a programmatic variant is sketched below).
2. Func::compile_to_lowered_stmt("gradient.html", {}, HTML) saves Halide's lowered pipeline as an HTML file, which makes it easier to read and understand the lowered intermediate representation.
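
If you prefer not to export the variable in the shell, it can also be set from the program itself before the first compilation happens. This is only a sketch, under the assumption that HL_DEBUG_CODEGEN is read lazily when Halide first compiles something; on platforms without setenv (e.g. MSVC), set the variable in the shell as shown above instead:

#include "Halide.h"
#include <cstdlib>
using namespace Halide;

int main() {
    // Assumption: setting the variable before any Halide compilation has the
    // same effect as exporting it in the shell.
    setenv("HL_DEBUG_CODEGEN", "1", /*overwrite=*/1);

    Func gradient("gradient");
    Var x("x"), y("y");
    gradient(x, y) = x + y;

    // JIT compilation here triggers the debug output on stderr.
    Buffer<int> output = gradient.realize(8, 8);
    return 0;
}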