公開日：2020/03/10
更新日：2020/08/03

【Swift Metal】thread_position_in_grid等の属性について解説

Swift
Metal

thread_position_in_gridやthreads_per_gridって何？
1D、2D、3Dで値がどう変わるの？

こんな悩みを抱えている方、たくさんいらっしゃると思います。
僕自身、MSLの実装を始めたばかりの頃は、属性値をなかなか理解できずに苦労致しました。

本記事では、MSLの属性値についての解説や、実際にどのような値が取れるのかについて記載しております。

１．MSLの属性について
２．1D、2D、3Dにおける属性の値
３．まとめ

MSLの属性について

まずはじめに、MSLにはどのような属性があるのかを紹介致します。以下は、MSLに用意されている属性の中から、GPGPUの実装で使用頻度の高い属性をピックアップしたものです。

属性	データタイプ	概要
thread_execution_width	ushort, uint	ThreadGroupの横幅
thread_position_in_grid	ushort, ushort2, ushort3, uint, uint2, uint3	GridにおけるThreadの位置
thread_position_in_threadgroup	ushort, ushort2, ushort3, uint, uint2, uint3	ThreadGroupにおけるThreadの位置
threadgroup_position_in_grid	ushort, ushort2, ushort3, uint, uint2, uint3	GridにおけるThreadGroupの位置
threadgroups_per_grid	ushort, ushort2, ushort3, uint, uint2, uint3	Grid毎のThreadGroup数
threads_per_grid	ushort, ushort2, ushort3, uint, uint2, uint3	Grid毎のThread数
threads_per_threadgroup	ushort, ushort2, ushort3, uint, uint2, uint3	ThreadGroup毎のThread数

その他の属性に関しましては、Metal Shading Language Specificationの”Table 5.7. Attributes for kernel function input arguments”に記載されておりますので、こちらをご覧いただければと思います。

1D、2D、3Dにおける属性の値

ここからは、実際に属性にはどのような値が格納されているのかをみていきます。
なお、今回みていく属性は、「MSLの属性について」でピックアップした7つの属性に絞っておりますので、予めご了承ください。

1Dの場合

まずは、1D(widthのみ)の実装の場合の値についてです。以下は、1000000の並列処理をGPUで実行するサンプルコードです。

ViewController
Shader.metal

import UIKit
import Metal

class ViewController: UIViewController {

    private let device = MTLCreateSystemDefaultDevice()!
    private var library: MTLLibrary!
    private var commandQueue: MTLCommandQueue!
    private var computePipelineState: MTLComputePipelineState!

    private let xCount = 1000000
    private var outputDataCount = 0

    let attribute = ["thread_execution_width", "thread_position_in_grid", "thread_position_in_threadgroup", "threadgroup_position_in_grid", "threadgroups_per_grid", "threads_per_grid", "threads_per_threadgroup"];

    override func viewDidLoad() {
        super.viewDidLoad()

        outputDataCount = xCount

        initMetal()
        startGPU()
    }

    private func initMetal() {
        guard let libUrl = Bundle.main.url(forResource: "default.metallib", withExtension: nil) else { return }
        library = try! device.makeLibrary(URL: libUrl)

        let function = library.makeFunction(name: "sample")!
        computePipelineState = try! device.makeComputePipelineState(function: function)

        commandQueue = device.makeCommandQueue()!
    }

    private func startGPU() {
        var outputDataList: [[Float]] = []
        for _ in 0..<attribute.count {
            outputDataList.append([Float](repeating: 1, count: outputDataCount))
        }

        var outputBufferList: [MTLBuffer] = []
        for i in 0..<attribute.count {
            outputBufferList.append(device.makeBuffer(bytes: outputDataList[i], length: MemoryLayout<Float>.stride * outputDataList[i].count, options: [])!)
        }

        let commandBuffer = commandQueue.makeCommandBuffer()!
        let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder()!
        computeCommandEncoder.setComputePipelineState(computePipelineState)

        for i in 0..<attribute.count {
            computeCommandEncoder.setBuffer(outputBufferList[i], offset: 0, index: i)
        }
        computeCommandEncoder.setBytes(&outputDataCount, length: MemoryLayout<Float>.size, index: attribute.count)

        let width = computePipelineState.threadExecutionWidth
        let threadgroupsPerGrid = MTLSize(width: (xCount + width - 1) / width, height: 1, depth: 1)
        let threadsPerThreadgroup = MTLSize(width: width, height: 1, depth: 1)
        print("threadgroupsPerGrid  : \(threadgroupsPerGrid)")
        print("threadsPerThreadgroup: \(threadsPerThreadgroup)")
        computeCommandEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)

        computeCommandEncoder.endEncoding()
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        var resultDataList: [[Float]] = []
        for i in 0..<attribute.count {
            let result = Data(bytesNoCopy: outputBufferList[i].contents(), count: MemoryLayout<Float>.stride * outputDataList[i].count, deallocator: .none)
            resultDataList.append(
                result.withUnsafeBytes {
                    Array(UnsafeBufferPointer(start: $0.baseAddress!.assumingMemoryBound(to: Float.self ), count: $0.count / MemoryLayout<Float>.size))
                }
            )
        }

        for i in 0..<outputDataCount {
            var str = "index: \(i) ||"
            for j in 0..<attribute.count {
                str.append(" \(attribute[j]): \(resultDataList[j][i]), ")
            }
            print(str)
        }
    }
}

#include <metal_stdlib>
using namespace metal;

kernel void sample(device float* outputData0    [[ buffer(0) ]],
                   device float* outputData1    [[ buffer(1) ]],
                   device float* outputData2    [[ buffer(2) ]],
                   device float* outputData3    [[ buffer(3) ]],
                   device float* outputData4    [[ buffer(4) ]],
                   device float* outputData5    [[ buffer(5) ]],
                   device float* outputData6    [[ buffer(6) ]],
                   const device int& dataLength [[ buffer(7) ]],

                   uint thread_execution_width         [[thread_execution_width]],         // ThreadGroupの横幅
                   uint thread_position_in_grid        [[thread_position_in_grid]],        // GridにおけるThreadの位置
                   uint thread_position_in_threadgroup [[thread_position_in_threadgroup]], // ThreadGroupにおけるThreadの位置
                   uint threadgroup_position_in_grid   [[threadgroup_position_in_grid]],   // GridにおけるThreadGroupの位置
                   uint threadgroups_per_grid          [[threadgroups_per_grid]],          // Grid毎のThreadGroup数
                   uint threads_per_grid               [[threads_per_grid]],               // Grid毎のThread数
                   uint threads_per_threadgroup        [[threads_per_threadgroup]])        // ThreadGroup毎のThread数
{
    int index = thread_position_in_grid;

    if (index > dataLength) { return; }

    outputData0[index]  = thread_execution_width;
    outputData1[index]  = thread_position_in_grid;
    outputData2[index]  = thread_position_in_threadgroup;
    outputData3[index]  = threadgroup_position_in_grid;
    outputData4[index]  = threadgroups_per_grid;
    outputData5[index]  = threads_per_grid;
    outputData6[index]  = threads_per_threadgroup;
}

【スレッドとスレッドグループの値】

threadgroupsPerGrid : MTLSize(width: 31250, height: 1, depth: 1)
threadsPerThreadgroup: MTLSize(width: 32, height: 1, depth: 1)

【出力結果(重要な部分のみピックアップ)】

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	31250.0	1000000.0	32.0
1	32.0	1.0	1.0	0.0	31250.0	1000000.0	32.0
2	32.0	2.0	2.0	0.0	31250.0	1000000.0	32.0
3	32.0	3.0	3.0	0.0	31250.0	1000000.0	32.0
30	32.0	30.0	30.0	0.0	31250.0	1000000.0	32.0
31	32.0	31.0	31.0	0.0	31250.0	1000000.0	32.0
32	32.0	32.0	0.0	1.0	31250.0	1000000.0	32.0
33	32.0	33.0	1.0	1.0	31250.0	1000000.0	32.0
999996	32.0	999996.0	28.0	31249.0	31250.0	1000000.0	32.0
999997	32.0	999997.0	29.0	31249.0	31250.0	1000000.0	32.0
999998	32.0	999998.0	30.0	31249.0	31250.0	1000000.0	32.0
999999	32.0	999999.0	31.0	31249.0	31250.0	1000000.0	32.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、0～999999が出力されました。
GridのWidthが、Threadのwidth(32) * ThreadGroupのwidth(31250) = 1000000 であるため、そのWidth間のThreadは0～999999となるためです。

thread_position_in_threadgroupからは、0～31が繰り返し出力されました。
ThreadGroupのWidthが32であるため、そのWidth間のThreadは0～31となるためです。

threadgroup_position_in_gridからは、32ごとにインクリメントされた値が出力されました。
ThreadGroupのWidthが32であるため、ThreadGroupの位置は、32ごとに切り替わるためです。

threadgroups_per_gridからは、31250が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidthです。

threads_per_gridからは、1000000が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidth(31250)と、threadsPerThreadgroupのwidth(32)をかけた値です。

threads_per_threadgroupからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのwidthです。

2Dの場合

続いて、2D(width * height)の実装の場合の値についてです。
※ 変更箇所の色を変えております。

ViewController
Shader.metal

import UIKit
import Metal

class ViewController: UIViewController {

    private let device = MTLCreateSystemDefaultDevice()!
    private var library: MTLLibrary!
    private var commandQueue: MTLCommandQueue!
    private var computePipelineState: MTLComputePipelineState!

    private let xCount = 1000
    private let yCount = 1000
    private var outputDataCount = 0

    let attribute = ["thread_execution_width", "thread_position_in_grid", "thread_position_in_threadgroup", "threadgroup_position_in_grid", "threadgroups_per_grid", "threads_per_grid", "threads_per_threadgroup"];

    override func viewDidLoad() {
        super.viewDidLoad()

        outputDataCount = xCount * yCount

        initMetal()
        startGPU()
    }

    private func initMetal() {
        guard let libUrl = Bundle.main.url(forResource: "default.metallib", withExtension: nil) else { return }
        library = try! device.makeLibrary(URL: libUrl)

        let function = library.makeFunction(name: "sample")!
        computePipelineState = try! device.makeComputePipelineState(function: function)

        commandQueue = device.makeCommandQueue()!
    }

    private func startGPU() {
        var outputDataList: [[Float]] = []
        for _ in 0..<attribute.count {
            outputDataList.append([Float](repeating: 1, count: outputDataCount))
        }

        var outputBufferList: [MTLBuffer] = []
        for i in 0..<attribute.count {
            outputBufferList.append(device.makeBuffer(bytes: outputDataList[i], length: MemoryLayout<Float>.stride * outputDataList[i].count, options: [])!)
        }

        let commandBuffer = commandQueue.makeCommandBuffer()!
        let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder()!
        computeCommandEncoder.setComputePipelineState(computePipelineState)

        for i in 0..<attribute.count {
            computeCommandEncoder.setBuffer(outputBufferList[i], offset: 0, index: i)
        }
        computeCommandEncoder.setBytes(&outputDataCount, length: MemoryLayout<Float>.size, index: attribute.count)

        let max    = computePipelineState.maxTotalThreadsPerThreadgroup
        let width  = computePipelineState.threadExecutionWidth
        let height = max / width
        let threadgroupsPerGrid = MTLSize(width: (xCount + width - 1) / width, height: (yCount + height - 1) / height, depth: 1)
        let threadsPerThreadgroup = MTLSize(width: width, height: height, depth: 1)
        print("threadgroupsPerGrid  : \(threadgroupsPerGrid)")
        print("threadsPerThreadgroup: \(threadsPerThreadgroup)")
        computeCommandEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)

        computeCommandEncoder.endEncoding()
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        var resultDataList: [[Float]] = []
        for i in 0..<attribute.count {
            let result = Data(bytesNoCopy: outputBufferList[i].contents(), count: MemoryLayout<Float>.stride * outputDataList[i].count, deallocator: .none)
            resultDataList.append(
                result.withUnsafeBytes {
                    Array(UnsafeBufferPointer(start: $0.baseAddress!.assumingMemoryBound(to: Float.self ), count: $0.count / MemoryLayout<Float>.size))
                }
            )
        }

        for i in 0..<outputDataCount {
            var str = "index: \(i) ||"
            for j in 0..<attribute.count {
                str.append(" \(attribute[j]): \(resultDataList[j][i]), ")
            }
            print(str)
        }
    }
}

#include <metal_stdlib>
using namespace metal;

kernel void sample(device float* outputData0    [[ buffer(0) ]],
                   device float* outputData1    [[ buffer(1) ]],
                   device float* outputData2    [[ buffer(2) ]],
                   device float* outputData3    [[ buffer(3) ]],
                   device float* outputData4    [[ buffer(4) ]],
                   device float* outputData5    [[ buffer(5) ]],
                   device float* outputData6    [[ buffer(6) ]],
                   const device int& dataLength [[ buffer(7) ]],

                   uint  thread_execution_width         [[thread_execution_width]],         // ThreadGroupの横幅
                   uint2 thread_position_in_grid        [[thread_position_in_grid]],        // GridにおけるThreadの位置
                   uint2 thread_position_in_threadgroup [[thread_position_in_threadgroup]], // ThreadGroupにおけるThreadの位置
                   uint2 threadgroup_position_in_grid   [[threadgroup_position_in_grid]],   // GridにおけるThreadGroupの位置
                   uint2 threadgroups_per_grid          [[threadgroups_per_grid]],          // Grid毎のThreadGroup数
                   uint2 threads_per_grid               [[threads_per_grid]],               // Grid毎のThread数
                   uint2 threads_per_threadgroup        [[threads_per_threadgroup]])        // ThreadGroup毎のThread数
{
    int index = thread_position_in_grid.y * threads_per_grid.x + thread_position_in_grid.x;

    if (index > dataLength) { return; }
    outputData0[index] = thread_execution_width;
    outputData1[index] = thread_position_in_grid.x;
    outputData2[index] = thread_position_in_threadgroup.x;
    outputData3[index] = threadgroup_position_in_grid.x;
    outputData4[index] = threadgroups_per_grid.x;
    outputData5[index] = threads_per_grid.x;
    outputData6[index] = threads_per_threadgroup.x;
		// Y値の出力結果表示する際は、.xの部分を .yに置き換えております。
}

【スレッドとスレッドグループの値】

threadgroupsPerGrid : MTLSize(width: 32, height: 32, depth: 1)
threadsPerThreadgroup: MTLSize(width: 32, height: 32, depth: 1)

【出力結果(重要な部分のみピックアップ)】

・X値

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
1	32.0	1.0	1.0	0.0	32.0	1024.0	32.0
2	32.0	2.0	2.0	0.0	32.0	1024.0	32.0
3	32.0	3.0	3.0	0.0	32.0	1024.0	32.0
30	32.0	30.0	30.0	0.0	32.0	1024.0	32.0
31	32.0	31.0	31.0	0.0	32.0	1024.0	32.0
32	32.0	32.0	0.0	1.0	32.0	1024.0	32.0
33	32.0	33.0	1.0	1.0	32.0	1024.0	32.0
32766	32.0	1022.0	30.0	31.0	32.0	1024.0	32.0
32767	32.0	1023.0	31.0	31.0	32.0	1024.0	32.0
32768	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
32769	32.0	1.0	1.0	0.0	32.0	1024.0	32.0
999996	32.0	572.0	28.0	17.0	32.0	1024.0	32.0
999997	32.0	573.0	29.0	17.0	32.0	1024.0	32.0
999998	32.0	574.0	30.0	17.0	32.0	1024.0	32.0
999999	32.0	575.0	31.0	17.0	32.0	1024.0	32.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、0～1023が出力されました。
GridのWidthが、Threadのwidth(32) * ThreadGroupのwidth(32) = 1024 であるため、そのWidth間のThreadは0～1023となるためです。

thread_position_in_threadgroupからは、0～31が繰り返し出力されました。
ThreadGroupのWidthが32であるため、そのWidth間のThreadは0～31となるためです。

threadgroup_position_in_gridからは、32ごとにインクリメントされた値が出力されました。
ThreadGroupのWidthが32であるため、32ごとにインクリメントされます。なお、GridのWidthが1024であるため、1024ごとに0に戻ります。

threadgroups_per_gridからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidthです。

threads_per_gridからは、1024が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidth(32)と、threadsPerThreadgroupのwidth(32)をかけた値です。

threads_per_threadgroupからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのwidthです。

・Y値

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
1	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
2	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
3	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
1022	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
1023	32.0	0.0	0.0	0.0	32.0	1024.0	32.0
1024	32.0	1.0	1.0	0.0	32.0	1024.0	32.0
1025	32.0	1.0	1.0	0.0	32.0	1024.0	32.0
32766	32.0	31.0	31.0	0.0	32.0	1024.0	32.0
32767	32.0	31.0	31.0	0.0	32.0	1024.0	32.0
32768	32.0	32.0	0.0	1.0	32.0	1024.0	32.0
32769	32.0	32.0	0.0	1.0	32.0	1024.0	32.0
999996	32.0	976.0	16.0	30.0	32.0	1024.0	32.0
999997	32.0	976.0	16.0	30.0	32.0	1024.0	32.0
999998	32.0	976.0	16.0	30.0	32.0	1024.0	32.0
999999	32.0	976.0	16.0	30.0	32.0	1024.0	32.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、1024ごとにインクリメントされた値が出力されました。
GridのWidth(Threadのwidth(32) * ThreadGroupのwidth(32) = 1024)ごとにインクリメントされるためです。

thread_position_in_threadgroupからは、32ごとにインクリメントされた値が出力されました。
ThreadGroupのWidth(32)ごとにインクリメントされるためです。

threadgroup_position_in_gridからは、32768ごとにインクリメントされた値が出力されました。
GridのWidth(1024) * ThreadGroupのheight(32) = 32768ごとにインクリメントされるためです。

threadgroups_per_gridからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのheightです。

threads_per_gridからは、1024が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのheight(32)と、threadsPerThreadgroupのheight(32)をかけた値です。

threads_per_threadgroupからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのheightです。

3Dの場合

続いて、3D(width * height * depth)の実装の場合の値についてです。

ViewController
Shader.metal

import UIKit
import Metal

class ViewController: UIViewController {

    private let device = MTLCreateSystemDefaultDevice()!
    private var library: MTLLibrary!
    private var commandQueue: MTLCommandQueue!
    private var computePipelineState: MTLComputePipelineState!

    private let xCount = 100
    private let yCount = 100
    private let zCount = 100
    private var outputDataCount = 0

    let attribute = ["thread_execution_width", "thread_position_in_grid", "thread_position_in_threadgroup", "threadgroup_position_in_grid", "threadgroups_per_grid", "threads_per_grid", "threads_per_threadgroup"];

    override func viewDidLoad() {
        super.viewDidLoad()

        outputDataCount = xCount * yCount * zCount

        initMetal()

        startGPU()
    }

    private func initMetal() {
        guard let libUrl = Bundle.main.url(forResource: "default.metallib", withExtension: nil) else { return }
        library = try! device.makeLibrary(URL: libUrl)

        let function = library.makeFunction(name: "sample")!
        computePipelineState = try! device.makeComputePipelineState(function: function)

        commandQueue = device.makeCommandQueue()!
    }

    private func startGPU() {
        var outputDataList: [[Float]] = []
        for _ in 0..<attribute.count {
            outputDataList.append([Float](repeating: 1, count: outputDataCount))
        }

        var outputBufferList: [MTLBuffer] = []
        for i in 0..<attribute.count {
            outputBufferList.append(device.makeBuffer(bytes: outputDataList[i], length: MemoryLayout<Float>.stride * outputDataList[i].count, options: [])!)
        }

        let commandBuffer = commandQueue.makeCommandBuffer()!
        let computeCommandEncoder = commandBuffer.makeComputeCommandEncoder()!
        computeCommandEncoder.setComputePipelineState(computePipelineState)

        for i in 0..<attribute.count {
            computeCommandEncoder.setBuffer(outputBufferList[i], offset: 0, index: i)
        }
        computeCommandEncoder.setBytes(&outputDataCount, length: MemoryLayout<Float>.size, index: attribute.count)

        let max    = computePipelineState.maxTotalThreadsPerThreadgroup
        let depth  = 4
        let width  = computePipelineState.threadExecutionWidth
        let height = max / width / depth
        let threadgroupsPerGrid = MTLSize(width: (xCount + width - 1) / width, height: (yCount + height - 1) / height, depth: (zCount + depth - 1) / depth)
        let threadsPerThreadgroup = MTLSize(width: width, height: height, depth: depth)
        print("threadgroupsPerGrid  : \(threadgroupsPerGrid)")
        print("threadsPerThreadgroup: \(threadsPerThreadgroup)")
        computeCommandEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)

        computeCommandEncoder.endEncoding()
        commandBuffer.commit()
        commandBuffer.waitUntilCompleted()

        var resultDataList: [[Float]] = []
        for i in 0..<attribute.count {
            let result = Data(bytesNoCopy: outputBufferList[i].contents(), count: MemoryLayout<Float>.stride * outputDataList[i].count, deallocator: .none)
            resultDataList.append(
                result.withUnsafeBytes {
                    Array(UnsafeBufferPointer(start: $0.baseAddress!.assumingMemoryBound(to: Float.self ), count: $0.count / MemoryLayout<Float>.size))
                }
            )
        }

        for i in 0..<outputDataCount {
            var str = "index: \(i) ||"
            for j in 0..<attribute.count {
                str.append(" \(attribute[j]): \(resultDataList[j][i]), ")
            }
            print(str)
        }
    }
}

#include <metal_stdlib>
using namespace metal;

kernel void sample(device float* outputData0    [[ buffer(0) ]],
                   device float* outputData1    [[ buffer(1) ]],
                   device float* outputData2    [[ buffer(2) ]],
                   device float* outputData3    [[ buffer(3) ]],
                   device float* outputData4    [[ buffer(4) ]],
                   device float* outputData5    [[ buffer(5) ]],
                   device float* outputData6    [[ buffer(6) ]],
                   const device int& dataLength [[ buffer(7) ]],

                   uint  thread_execution_width         [[thread_execution_width]],         // ThreadGroupの横幅
                   uint3 thread_position_in_grid        [[thread_position_in_grid]],        // GridにおけるThreadの位置
                   uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], // ThreadGroupにおけるThreadの位置
                   uint3 threadgroup_position_in_grid   [[threadgroup_position_in_grid]],   // GridにおけるThreadGroupの位置
                   uint3 threadgroups_per_grid          [[threadgroups_per_grid]],          // Grid毎のThreadGroup数
                   uint3 threads_per_grid               [[threads_per_grid]],               // Grid毎のThread数
                   uint3 threads_per_threadgroup        [[threads_per_threadgroup]])        // ThreadGroup毎のThread数
{
    int index = thread_position_in_grid.y * threads_per_grid.x + thread_position_in_grid.x + (thread_position_in_grid.z * threads_per_grid.y * threads_per_grid.x);

    if (index > dataLength) { return; }
    outputData0[index] = thread_execution_width;
    outputData1[index] = thread_position_in_grid.x;
    outputData2[index] = thread_position_in_threadgroup.x;
    outputData3[index] = threadgroup_position_in_grid.x;
    outputData4[index] = threadgroups_per_grid.x;
    outputData5[index] = threads_per_grid.x;
    outputData6[index] = threads_per_threadgroup.x;
		// Y値、Z値の出力結果表示する際は、.xの部分を .y、.zに置き換えております。
}

【スレッドとスレッドグループの値】

threadgroupsPerGrid : MTLSize(width: 4, height: 13, depth: 25)
threadsPerThreadgroup: MTLSize(width: 32, height: 8, depth: 4)

【出力結果(重要な部分のみピックアップ)】

・X値

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	4.0	128.0	32.0
1	32.0	1.0	1.0	0.0	4.0	128.0	32.0
2	32.0	2.0	2.0	0.0	4.0	128.0	32.0
3	32.0	3.0	3.0	0.0	4.0	128.0	32.0
30	32.0	30.0	30.0	0.0	4.0	128.0	32.0
31	32.0	31.0	31.0	0.0	4.0	128.0	32.0
32	32.0	32.0	0.0	1.0	4.0	128.0	32.0
33	32.0	33.0	1.0	1.0	4.0	128.0	32.0
126	32.0	126.0	30.0	3.0	4.0	128.0	32.0
127	32.0	127.0	31.0	3.0	4.0	128.0	32.0
128	32.0	0.0	0.0	0.0	4.0	128.0	32.0
129	32.0	1.0	1.0	0.0	4.0	128.0	32.0
999996	32.0	60.0	28.0	1.0	4.0	128.0	32.0
999997	32.0	61.0	29.0	1.0	4.0	128.0	32.0
999998	32.0	62.0	30.0	1.0	4.0	128.0	32.0
999999	32.0	63.0	31.0	1.0	4.0	128.0	32.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、0～127が出力されました。
GridのWidthが、Threadのwidth(32) * ThreadGroupのwidth(4) = 128 であるため、そのWidth間のThreadは0～128となるためです。

thread_position_in_threadgroupからは、0～31が繰り返し出力されました。
ThreadGroupのWidthが32であるため、そのWidth間のThreadは0～31となるためです。

threadgroup_position_in_gridからは、32ごとにインクリメントされた値が出力されました。
ThreadGroupのWidthが32であるため、32ごとにインクリメントされます。なお、GridのWidthが128であるため、128ごとに0に戻ります。

threadgroups_per_gridからは、4が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidthです。

threads_per_gridからは、128が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのwidth(4)と、threadsPerThreadgroupのwidth(32)をかけた値です。

threads_per_threadgroupからは、32が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのwidthです。

・Y値

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	13.0	104.0	8.0
1	32.0	0.0	0.0	0.0	13.0	104.0	8.0
2	32.0	0.0	0.0	0.0	13.0	104.0	8.0
3	32.0	0.0	0.0	0.0	13.0	104.0	8.0
126	32.0	0.0	0.0	0.0	13.0	104.0	8.0
127	32.0	0.0	0.0	0.0	13.0	104.0	8.0
128	32.0	1.0	1.0	0.0	13.0	104.0	8.0
129	32.0	1.0	1.0	0.0	13.0	104.0	8.0
1022	32.0	7.0	7.0	0.0	13.0	104.0	8.0
1023	32.0	7.0	7.0	0.0	13.0	104.0	8.0
1024	32.0	8.0	0.0	1.0	13.0	104.0	8.0
1025	32.0	8.0	0.0	1.0	13.0	104.0	8.0
13310	32.0	103.0	7.0	12.0	13.0	104.0	8.0
13311	32.0	103.0	7.0	12.0	13.0	104.0	8.0
13312	32.0	0.0	0.0	0.0	13.0	104.0	8.0
13313	32.0	0.0	0.0	0.0	13.0	104.0	8.0
999996	32.0	12.0	4.0	1.0	13.0	104.0	8.0
999997	32.0	12.0	4.0	1.0	13.0	104.0	8.0
999998	32.0	12.0	4.0	1.0	13.0	104.0	8.0
999999	32.0	12.0	4.0	1.0	13.0	104.0	8.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、128ごとにインクリメントされた値が出力されました。
GridのWidth(Threadのwidth(32) * ThreadGroupのwidth(4) = 128)ごとにインクリメントされるためです。
なお、depthが進むタイミング(13312)ごとに、0に戻ります。
※ GridのWidth(32 * 4) * GridのHeight(8 * 13) = depthが進むタイミング(13312)

thread_position_in_threadgroupからは、128ごとにインクリメントされた値が出力されました。
ThreadGroupのWidth(32) * ThreadGroupのDepth(4) = 128ごとにインクリメントされるためです。
なお、ThreadGroupが切り替わるタイミング(32 * 8 * 4 = 1024)ごとに、0に戻ります。

threadgroup_position_in_gridからは、1024ごとにインクリメントされた値が出力されました。
GridのWidth(128) * ThreadGroupのheight(8) = 1024ごとにインクリメントされるためです。

threadgroups_per_gridからは、13が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのheightです。

threads_per_gridからは、104が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのheight(13)と、threadsPerThreadgroupのheight(8)をかけた値です。

threads_per_threadgroupからは、8が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのheightです。

・Z値

index	thread execution width	thread position in grid	thread position in threadgroup	threadgroup position in grid	threadgroups per grid	threads per grid	threads per threadgroup
0	32.0	0.0	0.0	0.0	25.0	100.0	4.0
1	32.0	0.0	0.0	0.0	25.0	100.0	4.0
2	32.0	0.0	0.0	0.0	25.0	100.0	4.0
3	32.0	0.0	0.0	0.0	25.0	100.0	4.0
13310	32.0	0.0	0.0	0.0	25.0	100.0	4.0
13311	32.0	0.0	0.0	0.0	25.0	100.0	4.0
13312	32.0	1.0	1.0	0.0	25.0	100.0	4.0
13313	32.0	1.0	1.0	0.0	25.0	100.0	4.0
53246	32.0	3.0	3.0	0.0	25.0	100.0	4.0
53247	32.0	3.0	3.0	0.0	25.0	100.0	4.0
53248	32.0	4.0	0.0	1.0	25.0	100.0	4.0
53249	32.0	4.0	0.0	1.0	25.0	100.0	4.0
999996	32.0	75.0	3.0	18.0	25.0	100.0	4.0
999997	32.0	75.0	3.0	18.0	25.0	100.0	4.0
999998	32.0	75.0	3.0	18.0	25.0	100.0	4.0
999999	32.0	75.0	3.0	18.0	25.0	100.0	4.0

thread_execution_widthからは、32が出力されました。
これは、Threadのwidthです。

thread_position_in_gridからは、13312ごとにインクリメントされた値が出力されました。
GridのWidth(4 * 32) * GridのHeight(13 * 8) = 13312 ごとにインクリメントされるためです。

thread_position_in_threadgroupからは、32ごとにインクリメントされた値が出力されました。
ThreadGroupのWidth(32)ごとにインクリメントされるためです。

threadgroup_position_in_gridからは、53248ごとにインクリメントされた値が出力されました。
GridのWidth(4 * 32) * GridのHeight(13 * 8) * ThreadGroupのDepth(4) = 53248ごとにインクリメントされるためです。

threadgroups_per_gridからは、25が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのdepthです。

threads_per_gridからは、100が出力されました。
これは、dispatchThreadgroupsで設定した、threadgroupsPerGridのdepth(25)と、threadsPerThreadgroupのdepth(4)をかけた値です。

threads_per_threadgroupからは、4が出力されました。
これは、dispatchThreadgroupsで設定した、threadsPerThreadgroupのdepthです。